Commit 17a7426 (parent: ac0ad3c) by ravi.naik

Updated repository for gradio UI and model
Browse files
- .gitignore +160 -0
- README.md +1 -3
- app.py +69 -0
- generate/adapter.py +141 -0
- generate/adapter_v2.py +141 -0
- generate/base.py +268 -0
- generate/full.py +137 -0
- generate/lora.py +163 -0
- generate_test.ipynb +754 -0
- generation_config.json +10 -0
- lit_config.json +1 -0
- lit_gpt/__init__.py +22 -0
- lit_gpt/adapter.py +165 -0
- lit_gpt/adapter_v2.py +197 -0
- lit_gpt/config.py +1203 -0
- lit_gpt/lora.py +659 -0
- lit_gpt/model.py +345 -0
- lit_gpt/packed_dataset.py +237 -0
- lit_gpt/rmsnorm.py +26 -0
- lit_gpt/tokenizer.py +107 -0
- lit_gpt/utils.py +351 -0
- main.ipynb +714 -0
- out/redpajama/iter-003999-ckpt.pth +3 -0
- out/redpajama/iter-007999-ckpt.pth +3 -0
- out/redpajama/iter-011999-ckpt.pth +3 -0
- out/redpajama/lit_config.json +1 -0
- out/redpajama/lit_model.pth +3 -0
- out/redpajama/lit_model2.pth +3 -0
- out/redpajama/tokenizer.json +0 -0
- out/redpajama/tokenizer.model +3 -0
- out/redpajama/tokenizer_config.json +36 -0
- out/redpajama/version_1/metrics.csv +0 -0
- requirements.txt +5 -0
- tokenizer_config.json +36 -0
- tsai_gpt/__init__.py +15 -0
- tsai_gpt/config.py +1181 -0
- tsai_gpt/model.py +342 -0
- tsai_gpt/packed_dataset.py +235 -0
- tsai_gpt/rmsnorm.py +26 -0
- tsai_gpt/speed_monitor.py +425 -0
- tsai_gpt/tokenizer.py +103 -0
- tsai_gpt/utils.py +399 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: ERA
+title: "ERA-SESSION22 Training PyThia-160M from scratch on AWS Sagemaker"
 emoji: 📈
 colorFrom: indigo
 colorTo: yellow
@@ -9,5 +9,3 @@ app_file: app.py
 pinned: false
 license: mit
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,69 @@
import gradio as gr
import torch
from pathlib import Path

torch.set_float32_matmul_precision("high")

from generate.base import main


def generate(prompt, max_new_tokens, temperature, num_samples):
    prompt = prompt.strip()

    responses = main(
        prompt=prompt,
        checkpoint_dir=Path("out/redpajama"),
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        num_samples=num_samples,
    )
    return {output: responses}


with gr.Blocks() as app:
    gr.Markdown("## ERA Session22 - Pythia-160M Pre-training with LitGPT")
    gr.Markdown(
        """This is an implementation of Pythia-160M using [LitGPT](https://github.com/Lightning-AI/lit-gpt) by LightningAI.

        Please find the source code and training details [here](https://github.com/RaviNaik/ERA-SESSION22).

        Dataset used to train: [RedPajama](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T).
        """
    )
    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(label="Initial Prompt", interactive=True)
            max_new_tokens = gr.Slider(
                minimum=10,
                maximum=200,
                value=50,
                step=10,
                label="Select Number of Tokens to be Generated",
                interactive=True,
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1,
                value=0.7,
                step=0.1,
                label="Select Temperature",
                interactive=True,
            )
            num_samples = gr.Dropdown(
                choices=[1, 2, 5, 10],
                value=1,
                interactive=True,
                label="Select No. of outputs to be generated",
            )
            submit_btn = gr.Button(value="Generate")

        with gr.Column():
            output = gr.JSON(label="Generated Text")

    submit_btn.click(
        generate,
        inputs=[prompt_box, max_new_tokens, temperature, num_samples],
        outputs=[output],
    )

app.launch()
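The Space's generate() helper above is a thin wrapper around main() from generate/base.py. For reference, the same call can be made without the Gradio UI; the snippet below is only an illustrative sketch (it assumes the out/redpajama checkpoint directory from this commit and the packages in requirements.txt are available locally), not part of the commit:

# Illustrative sketch: mirrors the call that app.py's generate() makes.
from pathlib import Path

from generate.base import main

responses = main(
    prompt="The history of deep learning",
    num_samples=1,
    max_new_tokens=50,
    temperature=0.7,
    checkpoint_dir=Path("out/redpajama"),
)
print(responses)  # a list of {"response", "latency", "generation_rate"} dicts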
generate/adapter.py ADDED
@@ -0,0 +1,141 @@
import sys
import time
from pathlib import Path
from typing import Literal, Optional

import lightning as L
import torch
from lightning.fabric.plugins import BitsandbytesPrecision
from lightning.fabric.strategies import FSDPStrategy

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from generate.base import generate
from lit_gpt import Tokenizer
from lit_gpt.adapter import GPT, Block, Config
from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load
from scripts.prepare_alpaca import generate_prompt


def main(
    prompt: str = "What food do llamas eat?",
    input: str = "",
    adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"),
    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None,
    max_new_tokens: int = 100,
    top_k: Optional[int] = 200,
    temperature: float = 0.8,
    strategy: str = "auto",
    devices: int = 1,
    precision: Optional[str] = None,
) -> None:
    """Generates a response based on a given instruction and an optional input.
    This script will only work with checkpoints from the instruction-tuned GPT-Adapter model.
    See `finetune/adapter.py`.

    Args:
        prompt: The prompt/instruction (Alpaca style).
        input: Optional input (Alpaca style).
        adapter_path: Path to the checkpoint with trained adapter weights, which are the output of
            `finetune/adapter.py`.
        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
        quantize: Whether to quantize the model and using which method:
            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
            - bnb.int8: 8-bit quantization from bitsandbytes
            - gptq.int4: 4-bit quantization from GPTQ
            for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.
        strategy: Indicates the Fabric strategy setting to use.
        devices: How many devices to use.
        precision: Indicates the Fabric precision setting to use.
    """
    precision = precision or get_default_supported_precision(training=False)

    plugins = None
    if quantize is not None:
        if devices > 1:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the"
                " --quantize flag."
            )
        if quantize.startswith("bnb."):
            if "mixed" in precision:
                raise ValueError("Quantization and mixed precision is not supported.")
            dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
            plugins = BitsandbytesPrecision(quantize[4:], dtype)
            precision = None

    if strategy == "fsdp":
        strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

    fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
    fabric.launch()

    check_valid_checkpoint_dir(checkpoint_dir)

    config = Config.from_json(checkpoint_dir / "lit_config.json")

    if quantize is not None and devices > 1:
        raise NotImplementedError
    if quantize == "gptq.int4":
        model_file = "lit_model_gptq.4bit.pth"
        if not (checkpoint_dir / model_file).is_file():
            raise ValueError("Please run `python quantize/gptq.py` first")
    else:
        model_file = "lit_model.pth"
    checkpoint_path = checkpoint_dir / model_file

    tokenizer = Tokenizer(checkpoint_dir)
    sample = {"instruction": prompt, "input": input}
    prompt = generate_prompt(sample)
    encoded = tokenizer.encode(prompt, device=fabric.device)
    prompt_length = encoded.size(0)
    max_returned_tokens = prompt_length + max_new_tokens

    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
    t0 = time.perf_counter()
    with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
        model = GPT(config)
    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # enable the kv cache
        model.set_kv_cache(batch_size=1)
    model.eval()

    t0 = time.perf_counter()
    checkpoint = lazy_load(checkpoint_path)
    adapter_checkpoint = lazy_load(adapter_path)
    checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint))
    model.load_state_dict(checkpoint)
    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

    model = fabric.setup(model)

    L.seed_everything(1234)
    t0 = time.perf_counter()
    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
    t = time.perf_counter() - t0

    output = tokenizer.decode(y)
    output = output.split("### Response:")[1].strip()
    fabric.print(output)

    tokens_generated = y.size(0) - prompt_length
    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")
    CLI(main)
generate/adapter_v2.py ADDED
@@ -0,0 +1,141 @@
import sys
import time
from pathlib import Path
from typing import Literal, Optional

import lightning as L
import torch
from lightning.fabric.plugins import BitsandbytesPrecision
from lightning.fabric.strategies import FSDPStrategy

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from generate.base import generate
from lit_gpt import Tokenizer
from lit_gpt.adapter_v2 import GPT, Block, Config
from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load
from scripts.prepare_alpaca import generate_prompt


def main(
    prompt: str = "What food do llamas eat?",
    input: str = "",
    adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"),
    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None,
    max_new_tokens: int = 100,
    top_k: Optional[int] = 200,
    temperature: float = 0.8,
    strategy: str = "auto",
    devices: int = 1,
    precision: Optional[str] = None,
) -> None:
    """Generates a response based on a given instruction and an optional input.
    This script will only work with checkpoints from the instruction-tuned GPT-AdapterV2 model.
    See `finetune/adapter_v2.py`.

    Args:
        prompt: The prompt/instruction (Alpaca style).
        input: Optional input (Alpaca style).
        adapter_path: Path to the checkpoint with trained adapter weights, which are the output of
            `finetune/adapter_v2.py`.
        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
        quantize: Whether to quantize the model and using which method:
            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
            - bnb.int8: 8-bit quantization from bitsandbytes
            - gptq.int4: 4-bit quantization from GPTQ
            for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.
        strategy: Indicates the Fabric strategy setting to use.
        devices: How many devices to use.
        precision: Indicates the Fabric precision setting to use.
    """
    precision = precision or get_default_supported_precision(training=False)

    plugins = None
    if quantize is not None:
        if devices > 1:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the"
                " --quantize flag."
            )
        if quantize.startswith("bnb."):
            if "mixed" in precision:
                raise ValueError("Quantization and mixed precision is not supported.")
            dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
            plugins = BitsandbytesPrecision(quantize[4:], dtype)
            precision = None

    if strategy == "fsdp":
        strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

    fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
    fabric.launch()

    check_valid_checkpoint_dir(checkpoint_dir)

    config = Config.from_json(checkpoint_dir / "lit_config.json")

    if quantize is not None and devices > 1:
        raise NotImplementedError
    if quantize == "gptq.int4":
        model_file = "lit_model_gptq.4bit.pth"
        if not (checkpoint_dir / model_file).is_file():
            raise ValueError("Please run `python quantize/gptq.py` first")
    else:
        model_file = "lit_model.pth"
    checkpoint_path = checkpoint_dir / model_file

    tokenizer = Tokenizer(checkpoint_dir)
    sample = {"instruction": prompt, "input": input}
    prompt = generate_prompt(sample)
    encoded = tokenizer.encode(prompt, device=fabric.device)
    prompt_length = encoded.size(0)
    max_returned_tokens = prompt_length + max_new_tokens

    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
    t0 = time.perf_counter()
    with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
        model = GPT(config)
    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # enable the kv cache
        model.set_kv_cache(batch_size=1)
    model.eval()

    t0 = time.perf_counter()
    checkpoint = lazy_load(checkpoint_path)
    adapter_checkpoint = lazy_load(adapter_path)
    checkpoint.update(adapter_checkpoint.get("model", adapter_checkpoint))
    model.load_state_dict(checkpoint)
    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

    model = fabric.setup(model)

    L.seed_everything(1234)
    t0 = time.perf_counter()
    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
    t = time.perf_counter() - t0

    output = tokenizer.decode(y)
    output = output.split("### Response:")[1].strip()
    fabric.print(output)

    tokens_generated = y.size(0) - prompt_length
    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")
    CLI(main)
generate/base.py ADDED
@@ -0,0 +1,268 @@
import sys
import time
from pathlib import Path
from typing import Any, Literal, Optional

import lightning as L
import torch
import torch._dynamo.config
import torch._inductor.config
from lightning.fabric.plugins import BitsandbytesPrecision
from lightning.fabric.strategies import FSDPStrategy

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from lit_gpt import GPT, Config, Tokenizer
from lit_gpt.model import Block
from lit_gpt.utils import (
    check_valid_checkpoint_dir,
    get_default_supported_precision,
    gptq_quantization,
    load_checkpoint,
)


def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor:
    if torch._dynamo.is_compiling():
        # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly
        distribution = torch.empty_like(probs).exponential_(1)
        return torch.argmax(probs / distribution, dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1)


def sample(
    logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None
) -> torch.Tensor:
    logits = logits[0, -1]
    # optionally crop the logits to only the top k options
    if top_k is not None:
        v, i = torch.topk(logits, min(top_k, logits.size(-1)))
        # do not use `torch.where` as in nanogpt because it will repeat top-k collisions
        logits = torch.full_like(logits, float("-inf")).scatter_(-1, i, v)
    # optionally scale the logits and sample from a probability distribution
    if temperature > 0.0:
        probs = torch.nn.functional.softmax(logits / temperature, dim=-1)
        return multinomial_num_samples_1(probs)
    return torch.argmax(logits, dim=-1, keepdim=True)


def next_token(
    model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any
) -> torch.Tensor:
    logits = model(x, input_pos)
    next = sample(logits, **kwargs)
    return next.type_as(x)


@torch.inference_mode()
def generate(
    model: GPT,
    prompt: torch.Tensor,
    max_returned_tokens: int,
    *,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
    eos_id: Optional[int] = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use.
        prompt: Tensor of shape (T) with indices of the prompt sequence.
        max_returned_tokens: The maximum number of tokens to return (given plus generated).
        temperature: Scales the predicted logits by 1 / temperature.
        top_k: If specified, only sample among the tokens with the k highest probabilities.
        eos_id: If specified, stop generating any more token once the <eos> token is triggered.
    """
    T = prompt.size(0)
    assert max_returned_tokens > T
    if model.max_seq_length < max_returned_tokens - 1:
        # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a
        # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do
        # not support it to avoid negatively impacting the overall speed
        raise NotImplementedError(
            f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}"
        )

    device = prompt.device
    tokens = [prompt]
    input_pos = torch.tensor([T], device=device)
    token = next_token(
        model,
        torch.arange(0, T, device=device),
        prompt.view(1, -1),
        temperature=temperature,
        top_k=top_k,
    ).clone()
    tokens.append(token)
    for _ in range(2, max_returned_tokens - T + 1):
        token = next_token(
            model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k
        ).clone()
        tokens.append(token)
        if token == eos_id:
            break
        input_pos = input_pos.add_(1)
    return torch.cat(tokens)


def main(
    prompt: str = "What food do llamas eat?",
    *,
    num_samples: int = 1,
    max_new_tokens: int = 50,
    top_k: Optional[int] = 200,
    temperature: float = 0.8,
    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    quantize: Optional[
        Literal[
            "bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"
        ]
    ] = None,
    strategy: str = "auto",
    devices: int = 1,
    precision: Optional[str] = None,
    compile: bool = False,
) -> None:
    """Generates text samples based on a pre-trained model and tokenizer.

    Args:
        prompt: The prompt string to use for generating the samples.
        num_samples: The number of text samples to generate.
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.
        checkpoint_dir: The checkpoint directory to load.
        quantize: Whether to quantize the model and using which method:
            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
            - bnb.int8: 8-bit quantization from bitsandbytes
            - gptq.int4: 4-bit quantization from GPTQ
            for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
        strategy: Indicates the Fabric strategy setting to use.
        devices: How many devices to use.
        precision: Indicates the Fabric precision setting to use.
        compile: Whether to compile the model.
    """
    precision = precision or get_default_supported_precision(training=False)

    plugins = None
    if quantize is not None:
        if devices > 1:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the"
                " --quantize flag."
            )
        if quantize.startswith("bnb."):
            if "mixed" in precision:
                raise ValueError("Quantization and mixed precision is not supported.")
            dtype = {
                "16-true": torch.float16,
                "bf16-true": torch.bfloat16,
                "32-true": torch.float32,
            }[precision]
            plugins = BitsandbytesPrecision(quantize[4:], dtype)
            precision = None

    if strategy == "fsdp":
        strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

    fabric = L.Fabric(
        devices=devices, precision=precision, strategy=strategy, plugins=plugins
    )
    fabric.launch()

    check_valid_checkpoint_dir(checkpoint_dir)

    config = Config.from_json(checkpoint_dir / "lit_config.json")

    if quantize == "gptq.int4":
        model_file = "lit_model_gptq.4bit.pth"
        if not (checkpoint_dir / model_file).is_file():
            raise ValueError("Please run `python quantize/gptq.py` first")
    else:
        model_file = "lit_model.pth"
    checkpoint_path = checkpoint_dir / model_file

    tokenizer = Tokenizer(checkpoint_dir)
    encoded = tokenizer.encode(prompt, device=fabric.device)
    prompt_length = encoded.size(0)
    max_returned_tokens = prompt_length + max_new_tokens

    fabric.print(
        f"Loading model {str(checkpoint_path)!r} with {config.__dict__}",
        file=sys.stderr,
    )
    t0 = time.perf_counter()
    with fabric.init_module(empty_init=True), gptq_quantization(
        quantize == "gptq.int4"
    ):
        model = GPT(config)
    fabric.print(
        f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.",
        file=sys.stderr,
    )
    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # enable the kv cache
        model.set_kv_cache(batch_size=1)
    model.eval()

    if compile:
        torch._dynamo.config.automatic_dynamic_shapes = True
        torch._inductor.config.triton.unique_kernel_names = True
        torch._inductor.config.coordinate_descent_tuning = True
        global next_token
        next_token = torch.compile(next_token, mode="reduce-overhead")

    model = fabric.setup_module(model)

    t0 = time.perf_counter()
    load_checkpoint(fabric, model, checkpoint_path)
    fabric.print(
        f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.",
        file=sys.stderr,
    )

    L.seed_everything(1234)
    responses = []
    for i in range(num_samples):
        t0 = time.perf_counter()
        y = generate(
            model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k
        )
        t = time.perf_counter() - t0
        for block in model.transformer.h:
            block.attn.kv_cache.reset_parameters()

        fabric.print(tokenizer.decode(y))
        tokens_generated = y.size(0) - prompt_length
        fabric.print(
            f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec",
            file=sys.stderr,
        )
        responses.append(
            {
                "response": tokenizer.decode(y),
                "latency": f"{round(t, 2)} seconds",
                "generation_rate": f"{round(tokens_generated / t, 2)} tokens per sec",
            }
        )
    if fabric.device.type == "cuda":
        fabric.print(
            f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB",
            file=sys.stderr,
        )
    return responses


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")
    CLI(main)
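The sample() helper above is plain top-k plus temperature sampling. As a standalone toy illustration of that logic (a sketch for clarity, not part of the commit), the same steps applied to a hand-made logits vector look like this:

import torch

# Keep the k largest logits, mask the rest to -inf, rescale by 1/temperature,
# then draw one token index from the resulting distribution.
logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
top_k, temperature = 2, 0.7

v, i = torch.topk(logits, min(top_k, logits.size(-1)))
masked = torch.full_like(logits, float("-inf")).scatter_(-1, i, v)
probs = torch.softmax(masked / temperature, dim=-1)
token = torch.multinomial(probs, num_samples=1)
print(token.item())  # only index 0 or 1 can ever be drawn with top_k=2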
generate/full.py ADDED
@@ -0,0 +1,137 @@
import sys
import time
from pathlib import Path
from typing import Literal, Optional

import lightning as L
import torch
from lightning.fabric.plugins import BitsandbytesPrecision
from lightning.fabric.strategies import FSDPStrategy

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from generate.base import generate
from lit_gpt import GPT, Config, Tokenizer
from lit_gpt.model import Block
from lit_gpt.utils import (
    check_valid_checkpoint_dir,
    get_default_supported_precision,
    gptq_quantization,
    load_checkpoint,
)
from scripts.prepare_alpaca import generate_prompt


def main(
    prompt: str = "What food do llamas eat?",
    input: str = "",
    finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"),
    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None,
    max_new_tokens: int = 100,
    top_k: Optional[int] = 200,
    temperature: float = 0.8,
    strategy: str = "auto",
    devices: int = 1,
    precision: Optional[str] = None,
) -> None:
    """Generates a response based on a given instruction and an optional input.
    This script will only work with checkpoints from the instruction-tuned GPT model.
    See `finetune/full.py`.

    Args:
        prompt: The prompt/instruction (Alpaca style).
        input: Optional input (Alpaca style).
        finetuned_path: Path to the checkpoint with trained weights, which are the output of
            `finetune/full.py`.
        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
        quantize: Whether to quantize the model and using which method:
            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
            - bnb.int8: 8-bit quantization from bitsandbytes
            - gptq.int4: 4-bit quantization from GPTQ
            for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.
        strategy: Indicates the Fabric strategy setting to use.
        devices: How many devices to use.
        precision: Indicates the Fabric precision setting to use.
    """
    precision = precision or get_default_supported_precision(training=False)

    plugins = None
    if quantize is not None:
        if devices > 1:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the"
                " --quantize flag."
            )
        if quantize.startswith("bnb."):
            if "mixed" in precision:
                raise ValueError("Quantization and mixed precision is not supported.")
            dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
            plugins = BitsandbytesPrecision(quantize[4:], dtype)
            precision = None

    if strategy == "fsdp":
        strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

    fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
    fabric.launch()

    check_valid_checkpoint_dir(checkpoint_dir)

    config = Config.from_json(checkpoint_dir / "lit_config.json")

    if quantize is not None and devices > 1:
        raise NotImplementedError
    checkpoint_path = finetuned_path

    tokenizer = Tokenizer(checkpoint_dir)
    sample = {"instruction": prompt, "input": input}
    prompt = generate_prompt(sample)
    encoded = tokenizer.encode(prompt, device=fabric.device)
    prompt_length = encoded.size(0)
    max_returned_tokens = prompt_length + max_new_tokens

    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
    t0 = time.perf_counter()
    with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
        model = GPT(config)
    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # enable the kv cache
        model.set_kv_cache(batch_size=1)
    model.eval()

    model = fabric.setup(model)

    t0 = time.perf_counter()
    load_checkpoint(fabric, model, checkpoint_path)
    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

    L.seed_everything(1234)
    t0 = time.perf_counter()
    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
    t = time.perf_counter() - t0

    output = tokenizer.decode(y)
    output = output.split("### Response:")[1].strip()
    fabric.print(output)

    tokens_generated = y.size(0) - prompt_length
    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")
    CLI(main)
generate/lora.py ADDED
@@ -0,0 +1,163 @@
import sys
import time
from pathlib import Path
from typing import Literal, Optional

import lightning as L
import torch
from lightning.fabric.plugins import BitsandbytesPrecision
from lightning.fabric.strategies import FSDPStrategy

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from generate.base import generate
from lit_gpt import Tokenizer
from lit_gpt.lora import GPT, Block, Config, merge_lora_weights
from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load
from scripts.prepare_alpaca import generate_prompt

lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_query = True
lora_key = False
lora_value = True
lora_projection = False
lora_mlp = False
lora_head = False


def main(
    prompt: str = "What food do llamas eat?",
    input: str = "",
    lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"),
    checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None,
    max_new_tokens: int = 100,
    top_k: Optional[int] = 200,
    temperature: float = 0.8,
    strategy: str = "auto",
    devices: int = 1,
    precision: Optional[str] = None,
) -> None:
    """Generates a response based on a given instruction and an optional input.
    This script will only work with checkpoints from the instruction-tuned GPT-LoRA model.
    See `finetune/lora.py`.

    Args:
        prompt: The prompt/instruction (Alpaca style).
        input: Optional input (Alpaca style).
        lora_path: Path to the checkpoint with trained adapter weights, which are the output of
            `finetune/lora.py`.
        checkpoint_dir: The path to the checkpoint folder with pretrained GPT weights.
        quantize: Whether to quantize the model and using which method:
            - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes
            - bnb.int8: 8-bit quantization from bitsandbytes
            - gptq.int4: 4-bit quantization from GPTQ
            for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.
        strategy: Indicates the Fabric strategy setting to use.
        devices: How many devices to use.
        precision: Indicates the Fabric precision setting to use.
    """
    precision = precision or get_default_supported_precision(training=False)

    plugins = None
    if quantize is not None:
        if devices > 1:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the"
                " --quantize flag."
            )
        if quantize.startswith("bnb."):
            if "mixed" in precision:
                raise ValueError("Quantization and mixed precision is not supported.")
            dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
            plugins = BitsandbytesPrecision(quantize[4:], dtype)
            precision = None

    if strategy == "fsdp":
        strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False)

    fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
    fabric.launch()

    check_valid_checkpoint_dir(checkpoint_dir)

    config = Config.from_json(
        checkpoint_dir / "lit_config.json",
        r=lora_r,
        alpha=lora_alpha,
        dropout=lora_dropout,
        to_query=lora_query,
        to_key=lora_key,
        to_value=lora_value,
        to_projection=lora_projection,
        to_mlp=lora_mlp,
        to_head=lora_head,
    )

    if quantize is not None and devices > 1:
        raise NotImplementedError
    if quantize == "gptq.int4":
        model_file = "lit_model_gptq.4bit.pth"
        if not (checkpoint_dir / model_file).is_file():
            raise ValueError("Please run `python quantize/gptq.py` first")
    else:
        model_file = "lit_model.pth"
    checkpoint_path = checkpoint_dir / model_file

    tokenizer = Tokenizer(checkpoint_dir)
    sample = {"instruction": prompt, "input": input}
    prompt = generate_prompt(sample)
    encoded = tokenizer.encode(prompt, device=fabric.device)
    prompt_length = encoded.size(0)
    max_returned_tokens = prompt_length + max_new_tokens

    fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr)
    t0 = time.perf_counter()
    with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"):
        model = GPT(config)
    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)
    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # enable the kv cache
        model.set_kv_cache(batch_size=1)
    model.eval()

    t0 = time.perf_counter()
    checkpoint = lazy_load(checkpoint_path)
    lora_checkpoint = lazy_load(lora_path)
    checkpoint.update(lora_checkpoint.get("model", lora_checkpoint))
    model.load_state_dict(checkpoint)
    fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

    merge_lora_weights(model)
    model = fabric.setup(model)

    L.seed_everything(1234)
    t0 = time.perf_counter()
    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id)
    t = time.perf_counter() - t0

    output = tokenizer.decode(y)
    output = output.split("### Response:")[1].strip()
    fabric.print(output)

    tokens_generated = y.size(0) - prompt_length
    fabric.print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr)


if __name__ == "__main__":
    from jsonargparse import CLI

    torch.set_float32_matmul_precision("high")
    CLI(main)
generate_test.ipynb ADDED
@@ -0,0 +1,754 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import math\n",
    "import sys\n",
    "import time\n",
    "from pathlib import Path\n",
    "from typing import Optional, Tuple, Union\n",
    "\n",
    "import lightning as L\n",
    "import torch\n",
    "from lightning.fabric.loggers import CSVLogger\n",
    "from lightning.fabric.strategies import FSDPStrategy\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "# # support running without installing as a package\n",
    "# wd = Path(__file__).parent.parent.resolve()\n",
    "# sys.path.append(str(wd))\n",
    "\n",
    "from tsai_gpt.model import GPT, Block, Config\n",
    "from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset\n",
    "from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops\n",
    "from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor\n",
    "from tsai_gpt.utils import (\n",
    "    chunked_cross_entropy,\n",
    "    get_default_supported_precision,\n",
    "    num_parameters,\n",
    "    load_checkpoint,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"pythia-160m\"\n",
    "name = \"redpajama\"\n",
    "out_dir = Path(\"out\") / name\n",
    "save_interval = 1000\n",
    "eval_interval = 1000\n",
    "eval_iters = 100\n",
    "log_interval = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameters\n",
    "learning_rate = 6e-3\n",
    "batch_size = 32\n",
    "micro_batch_size = 8\n",
    "gradient_accumulation_steps = batch_size // micro_batch_size\n",
    "assert gradient_accumulation_steps > 0\n",
    "# max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices\n",
    "max_iters = 15000\n",
    "weight_decay = 1e-1\n",
    "beta1 = 0.9\n",
    "beta2 = 0.95\n",
    "grad_clip = 1.0\n",
    "decay_lr = True\n",
    "warmup_iters = 2000\n",
    "lr_decay_iters = max_iters\n",
    "min_lr = 6e-6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data proportions from https://arxiv.org/pdf/2302.13971.pdf Table 1\n",
    "data_config = [\n",
    "    (\"arxiv\", 2.5),\n",
    "    (\"book\", 4.5),\n",
    "    (\"c4\", 15.0),\n",
    "    (\"cc\", 67.0),\n",
    "    (\"github\", 4.5),\n",
    "    (\"stackexchange\", 2.0),\n",
    "    (\"wikipedia\", 4.5),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "hparams = {\n",
    "    k: v\n",
    "    for k, v in locals().items()\n",
    "    if isinstance(v, (int, float, str)) and not k.startswith(\"_\")\n",
    "}\n",
    "logger = CSVLogger(\"out\", name, flush_logs_every_n_steps=log_interval)\n",
    "\n",
    "\n",
    "def setup(\n",
    "    devices: int = 4,\n",
    "    train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
    "    val_data_dir: Optional[Path] = None,\n",
    "    precision: Optional[str] = None,\n",
    "    resume: Union[bool, Path] = False,\n",
    ") -> None:\n",
    "    precision = precision or get_default_supported_precision(training=True)\n",
    "\n",
    "    if devices > 1:\n",
    "        strategy = FSDPStrategy(\n",
    "            auto_wrap_policy={Block},\n",
    "            activation_checkpointing_policy={Block},\n",
    "            state_dict_type=\"full\",\n",
    "            limit_all_gathers=True,\n",
    "            cpu_offload=False,\n",
    "        )\n",
    "    else:\n",
    "        strategy = \"auto\"\n",
    "\n",
    "    fabric = L.Fabric(\n",
    "        devices=devices, strategy=strategy, precision=precision, loggers=logger\n",
    "    )\n",
    "    fabric.print(hparams)\n",
    "    fabric.launch(main, train_data_dir, val_data_dir, resume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_copy = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main(\n",
    "    fabric: L.Fabric,\n",
    "    train_data_dir: Path,\n",
    "    val_data_dir: Path,\n",
    "    resume: Union[bool, Path],\n",
    ") -> None:\n",
    "    global model_copy\n",
    "    speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit=\"seconds\")\n",
    "\n",
    "    if fabric.global_rank == 0:\n",
    "        out_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    config = Config.from_name(model_name)\n",
    "\n",
    "    train_dataloader, val_dataloader = create_dataloaders(\n",
    "        batch_size=micro_batch_size,\n",
    "        block_size=config.block_size,\n",
    "        fabric=fabric,\n",
189 |
+
" block_size=config.block_size,\n",
|
190 |
+
" fabric=fabric,\n",
|
191 |
+
" train_data_dir=train_data_dir,\n",
|
192 |
+
" val_data_dir=val_data_dir,\n",
|
193 |
+
" seed=(1337 + fabric.global_rank),\n",
|
194 |
+
" )\n",
|
195 |
+
" if val_dataloader is None:\n",
|
196 |
+
" train_dataloader = fabric.setup_dataloaders(train_dataloader)\n",
|
197 |
+
" else:\n",
|
198 |
+
" train_dataloader, val_dataloader = fabric.setup_dataloaders(\n",
|
199 |
+
" train_dataloader, val_dataloader\n",
|
200 |
+
" )\n",
|
201 |
+
"\n",
|
202 |
+
" fabric.seed_everything(1337) # same seed for every process to init model (FSDP)\n",
|
203 |
+
"\n",
|
204 |
+
" fabric.print(f\"Loading model with {config.__dict__}\")\n",
|
205 |
+
" t0 = time.perf_counter()\n",
|
206 |
+
" import torch\n",
|
207 |
+
" import torch.nn as nn\n",
|
208 |
+
"\n",
|
209 |
+
" def _init_weights(module: nn.Module) -> None:\n",
|
210 |
+
" \"\"\"Meant to be used with `gpt.apply(gpt._init_weights)`.\"\"\"\n",
|
211 |
+
" if isinstance(module, nn.Linear):\n",
|
212 |
+
" torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
|
213 |
+
" if module.bias is not None:\n",
|
214 |
+
" torch.nn.init.zeros_(module.bias)\n",
|
215 |
+
" elif isinstance(module, nn.Embedding):\n",
|
216 |
+
" torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
|
217 |
+
"\n",
|
218 |
+
" with fabric.init_module(empty_init=True):\n",
|
219 |
+
" model = GPT(config)\n",
|
220 |
+
" model.apply(_init_weights)\n",
|
221 |
+
" model.apply(_init_weights)\n",
|
222 |
+
"\n",
|
223 |
+
" # checkpoint_path = Path(\"out/redpajama/iter-000999-ckpt.pth\")\n",
|
224 |
+
"\n",
|
225 |
+
" # load_checkpoint(fabric, model, checkpoint_path)\n",
|
226 |
+
"\n",
|
227 |
+
" # print(model.transformer.h[0].mlp.fc.weight)\n",
|
228 |
+
"\n",
|
229 |
+
" fabric.print(f\"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.\")\n",
|
230 |
+
" fabric.print(f\"Total parameters {num_parameters(model):,}\")\n",
|
231 |
+
"\n",
|
232 |
+
" model = fabric.setup(model)\n",
|
233 |
+
" optimizer = torch.optim.AdamW(\n",
|
234 |
+
" model.parameters(),\n",
|
235 |
+
" lr=learning_rate,\n",
|
236 |
+
" weight_decay=weight_decay,\n",
|
237 |
+
" betas=(beta1, beta2),\n",
|
238 |
+
" foreach=False,\n",
|
239 |
+
" )\n",
|
240 |
+
"\n",
|
241 |
+
" # model_copy = model\n",
|
242 |
+
"\n",
|
243 |
+
" optimizer = fabric.setup_optimizers(optimizer)\n",
|
244 |
+
"\n",
|
245 |
+
" state = {\n",
|
246 |
+
" \"model\": model,\n",
|
247 |
+
" \"optimizer\": optimizer,\n",
|
248 |
+
" \"hparams\": hparams,\n",
|
249 |
+
" \"iter_num\": 0,\n",
|
250 |
+
" \"step_count\": 0,\n",
|
251 |
+
" }\n",
|
252 |
+
"\n",
|
253 |
+
" if resume is True:\n",
|
254 |
+
" resume = max(out_dir.glob(\"*.pth\"), key=lambda p: int(p.name.split(\"-\")[1]))\n",
|
255 |
+
" if resume:\n",
|
256 |
+
" fabric.print(f\"Resuming training from {resume}\")\n",
|
257 |
+
" fabric.load(resume, state)\n",
|
258 |
+
"\n",
|
259 |
+
" train_time = time.perf_counter()\n",
|
260 |
+
" train(fabric, state, train_dataloader, val_dataloader, speed_monitor)\n",
|
261 |
+
" fabric.print(f\"Training time: {(time.perf_counter()-train_time):.2f}s\")\n",
|
262 |
+
" if fabric.device.type == \"cuda\":\n",
|
263 |
+
" fabric.print(f\"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB\")"
|
264 |
+
]
|
265 |
+
},
|
266 |
+
{
|
267 |
+
"cell_type": "code",
|
268 |
+
"execution_count": 9,
|
269 |
+
"metadata": {},
|
270 |
+
"outputs": [],
|
271 |
+
"source": [
|
272 |
+
"def train(\n",
|
273 |
+
" fabric: L.Fabric,\n",
|
274 |
+
" state: dict,\n",
|
275 |
+
" train_dataloader: DataLoader,\n",
|
276 |
+
" val_dataloader: DataLoader,\n",
|
277 |
+
" speed_monitor: SpeedMonitorBase,\n",
|
278 |
+
") -> None:\n",
|
279 |
+
" model = state[\"model\"]\n",
|
280 |
+
" optimizer = state[\"optimizer\"]\n",
|
281 |
+
"\n",
|
282 |
+
" if val_dataloader is not None:\n",
|
283 |
+
" validate(fabric, model, val_dataloader) # sanity check\n",
|
284 |
+
"\n",
|
285 |
+
" with torch.device(\"meta\"):\n",
|
286 |
+
" meta_model = GPT(model.config)\n",
|
287 |
+
" # \"estimated\" is not as precise as \"measured\". Estimated is optimistic but widely used in the wild.\n",
|
288 |
+
" # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs,\n",
|
289 |
+
" # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead\n",
|
290 |
+
" estimated_flops = estimate_flops(meta_model) * micro_batch_size\n",
|
291 |
+
" fabric.print(\n",
|
292 |
+
" f\"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}\"\n",
|
293 |
+
" )\n",
|
294 |
+
" x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length))\n",
|
295 |
+
" measured_flops = measure_flops(meta_model, x)\n",
|
296 |
+
" fabric.print(\n",
|
297 |
+
" f\"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}\"\n",
|
298 |
+
" )\n",
|
299 |
+
" del meta_model, x\n",
|
300 |
+
"\n",
|
301 |
+
" total_lengths = 0\n",
|
302 |
+
" total_t0 = time.perf_counter()\n",
|
303 |
+
"\n",
|
304 |
+
" for state[\"iter_num\"], train_data in enumerate(train_dataloader, state[\"iter_num\"]):\n",
|
305 |
+
" if state[\"iter_num\"] >= max_iters:\n",
|
306 |
+
" checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
|
307 |
+
" fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
|
308 |
+
" fabric.save(checkpoint_path, state)\n",
|
309 |
+
" break\n",
|
310 |
+
"\n",
|
311 |
+
" # determine and set the learning rate for this iteration\n",
|
312 |
+
" lr = get_lr(state[\"iter_num\"]) if decay_lr else learning_rate\n",
|
313 |
+
" for param_group in optimizer.param_groups:\n",
|
314 |
+
" param_group[\"lr\"] = lr\n",
|
315 |
+
"\n",
|
316 |
+
" iter_t0 = time.perf_counter()\n",
|
317 |
+
"\n",
|
318 |
+
" input_ids = train_data[:, 0 : model.max_seq_length].contiguous()\n",
|
319 |
+
" targets = train_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
|
320 |
+
"\n",
|
321 |
+
" is_accumulating = (state[\"iter_num\"] + 1) % gradient_accumulation_steps != 0\n",
|
322 |
+
" with fabric.no_backward_sync(model, enabled=is_accumulating):\n",
|
323 |
+
" logits = model(input_ids)\n",
|
324 |
+
" loss = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
|
325 |
+
" fabric.backward(loss / gradient_accumulation_steps)\n",
|
326 |
+
"\n",
|
327 |
+
" # return\n",
|
328 |
+
"\n",
|
329 |
+
" if not is_accumulating:\n",
|
330 |
+
" fabric.clip_gradients(model, optimizer, max_norm=grad_clip)\n",
|
331 |
+
" optimizer.step()\n",
|
332 |
+
" optimizer.zero_grad()\n",
|
333 |
+
" state[\"step_count\"] += 1\n",
|
334 |
+
"\n",
|
335 |
+
" t1 = time.perf_counter()\n",
|
336 |
+
" total_lengths += input_ids.size(1)\n",
|
337 |
+
" speed_monitor.on_train_batch_end(\n",
|
338 |
+
" (state[\"iter_num\"] + 1) * micro_batch_size,\n",
|
339 |
+
" t1 - total_t0,\n",
|
340 |
+
" # this assumes that device FLOPs are the same and that all devices have the same batch size\n",
|
341 |
+
" fabric.world_size,\n",
|
342 |
+
" flops_per_batch=measured_flops,\n",
|
343 |
+
" lengths=total_lengths,\n",
|
344 |
+
" )\n",
|
345 |
+
" if state[\"iter_num\"] % log_interval == 0:\n",
|
346 |
+
" fabric.print(\n",
|
347 |
+
" f\"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:\"\n",
|
348 |
+
" f\" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}\"\n",
|
349 |
+
" )\n",
|
350 |
+
"\n",
|
351 |
+
" if (\n",
|
352 |
+
" val_dataloader is not None\n",
|
353 |
+
" and not is_accumulating\n",
|
354 |
+
" and state[\"step_count\"] % eval_interval == 0\n",
|
355 |
+
" ):\n",
|
356 |
+
" t0 = time.perf_counter()\n",
|
357 |
+
" val_loss = validate(fabric, model, val_dataloader)\n",
|
358 |
+
" t1 = time.perf_counter() - t0\n",
|
359 |
+
" speed_monitor.eval_end(t1)\n",
|
360 |
+
" fabric.print(\n",
|
361 |
+
" f\"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms\"\n",
|
362 |
+
" )\n",
|
363 |
+
" fabric.barrier()\n",
|
364 |
+
" if not is_accumulating and state[\"step_count\"] % save_interval == 0:\n",
|
365 |
+
" checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
|
366 |
+
" fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
|
367 |
+
" fabric.save(checkpoint_path, state)"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": 10,
|
373 |
+
"metadata": {},
|
374 |
+
"outputs": [],
|
375 |
+
"source": [
|
376 |
+
"@torch.inference_mode()\n",
|
377 |
+
"def validate(\n",
|
378 |
+
" fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader\n",
|
379 |
+
") -> torch.Tensor:\n",
|
380 |
+
" fabric.print(\"Validating ...\")\n",
|
381 |
+
" model.eval()\n",
|
382 |
+
"\n",
|
383 |
+
" losses = torch.zeros(eval_iters, device=fabric.device)\n",
|
384 |
+
" for k, val_data in enumerate(val_dataloader):\n",
|
385 |
+
" input_ids = val_data[:, 0 : model.max_seq_length].contiguous()\n",
|
386 |
+
" targets = val_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
|
387 |
+
" logits = model(input_ids)\n",
|
388 |
+
" losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
|
389 |
+
" out = losses.mean()\n",
|
390 |
+
"\n",
|
391 |
+
" model.train()\n",
|
392 |
+
" return out"
|
393 |
+
]
|
394 |
+
},
|
395 |
+
{
|
396 |
+
"cell_type": "code",
|
397 |
+
"execution_count": 11,
|
398 |
+
"metadata": {},
|
399 |
+
"outputs": [],
|
400 |
+
"source": [
|
401 |
+
"def create_dataloader(\n",
|
402 |
+
" batch_size: int,\n",
|
403 |
+
" block_size: int,\n",
|
404 |
+
" data_dir: Path,\n",
|
405 |
+
" fabric: L.Fabric,\n",
|
406 |
+
" shuffle: bool = True,\n",
|
407 |
+
" seed: int = 12345,\n",
|
408 |
+
") -> DataLoader:\n",
|
409 |
+
" datasets = []\n",
|
410 |
+
" for prefix, _ in data_config:\n",
|
411 |
+
" filenames = glob.glob(str(data_dir / f\"{prefix}*\"))\n",
|
412 |
+
" dataset = PackedDataset(\n",
|
413 |
+
" filenames,\n",
|
414 |
+
" n_chunks=4,\n",
|
415 |
+
" block_size=block_size,\n",
|
416 |
+
" shuffle=shuffle,\n",
|
417 |
+
" seed=seed,\n",
|
418 |
+
" num_processes=fabric.world_size,\n",
|
419 |
+
" process_rank=fabric.global_rank,\n",
|
420 |
+
" )\n",
|
421 |
+
" datasets.append(dataset)\n",
|
422 |
+
"\n",
|
423 |
+
" if not datasets:\n",
|
424 |
+
" raise RuntimeError(\n",
|
425 |
+
" f\"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset.\"\n",
|
426 |
+
" )\n",
|
427 |
+
"\n",
|
428 |
+
" weights = [weight for _, weight in data_config]\n",
|
429 |
+
" sum_weights = sum(weights)\n",
|
430 |
+
" weights = [el / sum_weights for el in weights]\n",
|
431 |
+
"\n",
|
432 |
+
" combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)\n",
|
433 |
+
"\n",
|
434 |
+
" return DataLoader(\n",
|
435 |
+
" combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True\n",
|
436 |
+
" )"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"cell_type": "code",
|
441 |
+
"execution_count": 12,
|
442 |
+
"metadata": {},
|
443 |
+
"outputs": [],
|
444 |
+
"source": [
|
445 |
+
"def create_dataloaders(\n",
|
446 |
+
" batch_size: int,\n",
|
447 |
+
" block_size: int,\n",
|
448 |
+
" fabric: L.Fabric,\n",
|
449 |
+
" train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
|
450 |
+
" val_data_dir: Optional[Path] = None,\n",
|
451 |
+
" seed: int = 12345,\n",
|
452 |
+
") -> Tuple[DataLoader, DataLoader]:\n",
|
453 |
+
" # Increase by one because we need the next word as well\n",
|
454 |
+
" effective_block_size = block_size + 1\n",
|
455 |
+
" train_dataloader = create_dataloader(\n",
|
456 |
+
" batch_size=batch_size,\n",
|
457 |
+
" block_size=effective_block_size,\n",
|
458 |
+
" fabric=fabric,\n",
|
459 |
+
" data_dir=train_data_dir,\n",
|
460 |
+
" shuffle=True,\n",
|
461 |
+
" seed=seed,\n",
|
462 |
+
" )\n",
|
463 |
+
" val_dataloader = (\n",
|
464 |
+
" create_dataloader(\n",
|
465 |
+
" batch_size=batch_size,\n",
|
466 |
+
" block_size=effective_block_size,\n",
|
467 |
+
" fabric=fabric,\n",
|
468 |
+
" data_dir=val_data_dir,\n",
|
469 |
+
" shuffle=False,\n",
|
470 |
+
" seed=seed,\n",
|
471 |
+
" )\n",
|
472 |
+
" if val_data_dir\n",
|
473 |
+
" else None\n",
|
474 |
+
" )\n",
|
475 |
+
" return train_dataloader, val_dataloader"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "code",
|
480 |
+
"execution_count": 13,
|
481 |
+
"metadata": {},
|
482 |
+
"outputs": [],
|
483 |
+
"source": [
|
484 |
+
"def get_lr(it: int) -> float:\n",
|
485 |
+
" # 1) linear warmup for warmup_iters steps\n",
|
486 |
+
" if it < warmup_iters:\n",
|
487 |
+
" return learning_rate * it / warmup_iters\n",
|
488 |
+
" # 2) if it > lr_decay_iters, return min learning rate\n",
|
489 |
+
" if it > lr_decay_iters:\n",
|
490 |
+
" return min_lr\n",
|
491 |
+
" # 3) in between, use cosine decay down to min learning rate\n",
|
492 |
+
" decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
|
493 |
+
" assert 0 <= decay_ratio <= 1\n",
|
494 |
+
" coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1\n",
|
495 |
+
" return min_lr + coeff * (learning_rate - min_lr)"
|
496 |
+
]
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"cell_type": "code",
|
500 |
+
"execution_count": 16,
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [],
|
503 |
+
"source": [
|
504 |
+
"# torch.set_float32_matmul_precision(\"medium\")\n",
|
505 |
+
"# setup(devices=1, train_data_dir=Path(\"data/lit-redpajama-sample\"))"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 5,
|
511 |
+
"metadata": {},
|
512 |
+
"outputs": [],
|
513 |
+
"source": [
|
514 |
+
"from generate.base import main\n",
|
515 |
+
"from pathlib import Path"
|
516 |
+
]
|
517 |
+
},
|
518 |
+
{
|
519 |
+
"cell_type": "code",
|
520 |
+
"execution_count": 7,
|
521 |
+
"metadata": {},
|
522 |
+
"outputs": [
|
523 |
+
{
|
524 |
+
"name": "stderr",
|
525 |
+
"output_type": "stream",
|
526 |
+
"text": [
|
527 |
+
"Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
528 |
+
"Time to instantiate model: 0.17 seconds.\n"
|
529 |
+
]
|
530 |
+
},
|
531 |
+
{
|
532 |
+
"name": "stderr",
|
533 |
+
"output_type": "stream",
|
534 |
+
"text": [
|
535 |
+
"Time to load the model weights: 0.50 seconds.\n",
|
536 |
+
"Seed set to 1234\n"
|
537 |
+
]
|
538 |
+
},
|
539 |
+
{
|
540 |
+
"name": "stdout",
|
541 |
+
"output_type": "stream",
|
542 |
+
"text": [
|
543 |
+
"Earth is a planet with rocky core and 100,000 hectares of natural Earth. Our planet is a planet with rocky core and 100,000 hectares of natural Earth. The sun has a warm, warm surface and the sun has a\n"
|
544 |
+
]
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"name": "stderr",
|
548 |
+
"output_type": "stream",
|
549 |
+
"text": [
|
550 |
+
"Time for inference 1: 0.71 sec total, 70.90 tokens/sec\n",
|
551 |
+
"Memory used: 0.35 GB\n"
|
552 |
+
]
|
553 |
+
}
|
554 |
+
],
|
555 |
+
"source": [
|
556 |
+
"import torch\n",
|
557 |
+
"\n",
|
558 |
+
"torch.set_float32_matmul_precision(\"high\")\n",
|
559 |
+
"main(\n",
|
560 |
+
" prompt=\"Earth is a planet with rocky core and \",\n",
|
561 |
+
" checkpoint_dir=Path(\"out/redpajama\"),\n",
|
562 |
+
")"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"cell_type": "code",
|
567 |
+
"execution_count": 12,
|
568 |
+
"metadata": {},
|
569 |
+
"outputs": [
|
570 |
+
{
|
571 |
+
"name": "stderr",
|
572 |
+
"output_type": "stream",
|
573 |
+
"text": [
|
574 |
+
"Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
575 |
+
"Time to instantiate model: 0.02 seconds.\n"
|
576 |
+
]
|
577 |
+
},
|
578 |
+
{
|
579 |
+
"name": "stderr",
|
580 |
+
"output_type": "stream",
|
581 |
+
"text": [
|
582 |
+
"Time to load the model weights: 0.49 seconds.\n",
|
583 |
+
"Seed set to 1234\n"
|
584 |
+
]
|
585 |
+
},
|
586 |
+
{
|
587 |
+
"name": "stdout",
|
588 |
+
"output_type": "stream",
|
589 |
+
"text": [
|
590 |
+
"I like to drive when it is raining outside and 100% of the time. The next day, I think you will see the right movement.\n",
|
591 |
+
"We already know that if you don't go to the center, you can be a hug, or a bit more vigor.\n"
|
592 |
+
]
|
593 |
+
},
|
594 |
+
{
|
595 |
+
"name": "stderr",
|
596 |
+
"output_type": "stream",
|
597 |
+
"text": [
|
598 |
+
"Time for inference 1: 0.69 sec total, 72.80 tokens/sec\n",
|
599 |
+
"Memory used: 0.35 GB\n"
|
600 |
+
]
|
601 |
+
}
|
602 |
+
],
|
603 |
+
"source": [
|
604 |
+
"main(\n",
|
605 |
+
" prompt=\"I like to drive when it is raining outside and \",\n",
|
606 |
+
" checkpoint_dir=Path(\"out/redpajama\"),\n",
|
607 |
+
")"
|
608 |
+
]
|
609 |
+
},
|
610 |
+
{
|
611 |
+
"cell_type": "code",
|
612 |
+
"execution_count": 13,
|
613 |
+
"metadata": {},
|
614 |
+
"outputs": [
|
615 |
+
{
|
616 |
+
"name": "stderr",
|
617 |
+
"output_type": "stream",
|
618 |
+
"text": [
|
619 |
+
"Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
620 |
+
"Time to instantiate model: 0.02 seconds.\n",
|
621 |
+
"Time to load the model weights: 0.51 seconds.\n",
|
622 |
+
"Seed set to 1234\n"
|
623 |
+
]
|
624 |
+
},
|
625 |
+
{
|
626 |
+
"name": "stdout",
|
627 |
+
"output_type": "stream",
|
628 |
+
"text": [
|
629 |
+
"I like to drive when it is raining outside and 100% of the time. The next day, I think you will see the right movement.\n",
|
630 |
+
"We already know that if you don't go to the center, you can be a hug, or a bit more vigor.\n"
|
631 |
+
]
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"name": "stderr",
|
635 |
+
"output_type": "stream",
|
636 |
+
"text": [
|
637 |
+
"Time for inference 1: 0.65 sec total, 76.96 tokens/sec\n",
|
638 |
+
"Memory used: 0.35 GB\n"
|
639 |
+
]
|
640 |
+
}
|
641 |
+
],
|
642 |
+
"source": [
|
643 |
+
"main(\n",
|
644 |
+
" prompt=\"I like to drive when it is raining outside and \",\n",
|
645 |
+
" checkpoint_dir=Path(\"out/redpajama\"),\n",
|
646 |
+
")"
|
647 |
+
]
|
648 |
+
},
|
649 |
+
{
|
650 |
+
"cell_type": "code",
|
651 |
+
"execution_count": 10,
|
652 |
+
"metadata": {},
|
653 |
+
"outputs": [
|
654 |
+
{
|
655 |
+
"name": "stderr",
|
656 |
+
"output_type": "stream",
|
657 |
+
"text": [
|
658 |
+
"Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
659 |
+
"Time to instantiate model: 0.02 seconds.\n",
|
660 |
+
"Time to load the model weights: 0.49 seconds.\n",
|
661 |
+
"Seed set to 1234\n"
|
662 |
+
]
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"name": "stdout",
|
666 |
+
"output_type": "stream",
|
667 |
+
"text": [
|
668 |
+
"What a beautiful day it was, never imagined I would be able to 100,000 times a month. It was the beginning of a carpet, and was about 15 minutes to drain from the carpet. We were so overwhelmed, ready to do the kits,\n"
|
669 |
+
]
|
670 |
+
},
|
671 |
+
{
|
672 |
+
"name": "stderr",
|
673 |
+
"output_type": "stream",
|
674 |
+
"text": [
|
675 |
+
"Time for inference 1: 0.68 sec total, 73.18 tokens/sec\n",
|
676 |
+
"Memory used: 0.35 GB\n"
|
677 |
+
]
|
678 |
+
}
|
679 |
+
],
|
680 |
+
"source": [
|
681 |
+
"main(\n",
|
682 |
+
" prompt=\"What a beautiful day it was, never imagined I would be able to \",\n",
|
683 |
+
" checkpoint_dir=Path(\"out/redpajama\"),\n",
|
684 |
+
")"
|
685 |
+
]
|
686 |
+
},
|
687 |
+
{
|
688 |
+
"cell_type": "code",
|
689 |
+
"execution_count": 11,
|
690 |
+
"metadata": {},
|
691 |
+
"outputs": [
|
692 |
+
{
|
693 |
+
"name": "stderr",
|
694 |
+
"output_type": "stream",
|
695 |
+
"text": [
|
696 |
+
"Loading model 'out/redpajama/lit_model.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
697 |
+
"Time to instantiate model: 0.02 seconds.\n",
|
698 |
+
"Time to load the model weights: 0.49 seconds.\n",
|
699 |
+
"Seed set to 1234\n"
|
700 |
+
]
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"name": "stdout",
|
704 |
+
"output_type": "stream",
|
705 |
+
"text": [
|
706 |
+
"Do you think Einstein was the greatest ever physicist ever lived? I think 1 of the 1980s wrote a very deep, poetic narration of my life. I know all of you and your life is beautiful, especially in the sense of storytelling. You are. I know all of you\n"
|
707 |
+
]
|
708 |
+
},
|
709 |
+
{
|
710 |
+
"name": "stderr",
|
711 |
+
"output_type": "stream",
|
712 |
+
"text": [
|
713 |
+
"Time for inference 1: 0.68 sec total, 74.07 tokens/sec\n",
|
714 |
+
"Memory used: 0.35 GB\n"
|
715 |
+
]
|
716 |
+
}
|
717 |
+
],
|
718 |
+
"source": [
|
719 |
+
"main(\n",
|
720 |
+
" prompt=\"Do you think Einstein was the greatest ever physicist ever lived? I think \",\n",
|
721 |
+
" checkpoint_dir=Path(\"out/redpajama\"),\n",
|
722 |
+
")"
|
723 |
+
]
|
724 |
+
},
|
725 |
+
{
|
726 |
+
"cell_type": "code",
|
727 |
+
"execution_count": null,
|
728 |
+
"metadata": {},
|
729 |
+
"outputs": [],
|
730 |
+
"source": []
|
731 |
+
}
|
732 |
+
],
|
733 |
+
"metadata": {
|
734 |
+
"kernelspec": {
|
735 |
+
"display_name": "base",
|
736 |
+
"language": "python",
|
737 |
+
"name": "python3"
|
738 |
+
},
|
739 |
+
"language_info": {
|
740 |
+
"codemirror_mode": {
|
741 |
+
"name": "ipython",
|
742 |
+
"version": 3
|
743 |
+
},
|
744 |
+
"file_extension": ".py",
|
745 |
+
"mimetype": "text/x-python",
|
746 |
+
"name": "python",
|
747 |
+
"nbconvert_exporter": "python",
|
748 |
+
"pygments_lexer": "ipython3",
|
749 |
+
"version": "3.10.13"
|
750 |
+
}
|
751 |
+
},
|
752 |
+
"nbformat": 4,
|
753 |
+
"nbformat_minor": 2
|
754 |
+
}
|
generation_config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token_id": 1,
|
3 |
+
"do_sample": true,
|
4 |
+
"eos_token_id": 2,
|
5 |
+
"max_length": 4096,
|
6 |
+
"pad_token_id": 0,
|
7 |
+
"temperature": 0.6,
|
8 |
+
"top_p": 0.9,
|
9 |
+
"transformers_version": "4.32.0.dev0"
|
10 |
+
}
|
lit_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"name": "Llama-2-7b-chat-hf", "hf_config": {"org": "meta-llama", "name": "Llama-2-7b-chat-hf"}, "block_size": 4096, "vocab_size": 32000, "padding_multiple": 64, "padded_vocab_size": 32000, "n_layer": 32, "n_head": 32, "n_embd": 4096, "rotary_percentage": 1.0, "parallel_residual": false, "bias": false, "lm_head_bias": false, "n_query_groups": 32, "shared_attention_norm": false, "_norm_class": "RMSNorm", "norm_eps": 1e-05, "_mlp_class": "LLaMAMLP", "gelu_approximate": "none", "intermediate_size": 11008, "rope_condense_ratio": 1, "rope_base": 10000}
|
lit_gpt/__init__.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import logging
|
3 |
+
|
4 |
+
from lit_gpt.model import GPT
|
5 |
+
from lit_gpt.config import Config
|
6 |
+
from lit_gpt.tokenizer import Tokenizer
|
7 |
+
|
8 |
+
from lightning_utilities.core.imports import RequirementCache
|
9 |
+
|
10 |
+
_LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0")
|
11 |
+
if not bool(_LIGHTNING_AVAILABLE):
|
12 |
+
raise ImportError(
|
13 |
+
"Lit-GPT requires lightning nightly. Please run:\n"
|
14 |
+
f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}"
|
15 |
+
)
|
16 |
+
|
17 |
+
# Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632
|
18 |
+
pattern = re.compile(".*Profiler function .* will be ignored")
|
19 |
+
logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage()))
|
20 |
+
|
21 |
+
|
22 |
+
__all__ = ["GPT", "Config", "Tokenizer"]
|
lit_gpt/adapter.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implementation of the paper:
|
2 |
+
|
3 |
+
LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention
|
4 |
+
https://arxiv.org/abs/2303.16199
|
5 |
+
|
6 |
+
Port for Lit-GPT
|
7 |
+
"""
|
8 |
+
from dataclasses import dataclass
|
9 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
from typing_extensions import Self
|
14 |
+
|
15 |
+
from lit_gpt.config import Config as BaseConfig
|
16 |
+
from lit_gpt.model import GPT as BaseModel
|
17 |
+
from lit_gpt.model import Block as BaseBlock
|
18 |
+
from lit_gpt.model import CausalSelfAttention as BaseCausalSelfAttention
|
19 |
+
|
20 |
+
|
21 |
+
@dataclass
|
22 |
+
class Config(BaseConfig):
|
23 |
+
adapter_prompt_length: int = 10
|
24 |
+
adapter_start_layer: int = 2
|
25 |
+
|
26 |
+
|
27 |
+
class GPT(BaseModel):
|
28 |
+
"""The implementation is identical to `lit_gpt.model.GPT` with the exception that
|
29 |
+
the `Block` saves the layer index and passes it down to the attention layer."""
|
30 |
+
|
31 |
+
def __init__(self, config: Config) -> None:
|
32 |
+
nn.Module.__init__(self)
|
33 |
+
assert config.padded_vocab_size is not None
|
34 |
+
self.config = config
|
35 |
+
|
36 |
+
self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
|
37 |
+
self.transformer = nn.ModuleDict(
|
38 |
+
dict(
|
39 |
+
wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
|
40 |
+
h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
|
41 |
+
ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
|
42 |
+
)
|
43 |
+
)
|
44 |
+
self.max_seq_length = self.config.block_size
|
45 |
+
self.mask_cache: Optional[torch.Tensor] = None
|
46 |
+
|
47 |
+
def forward(
|
48 |
+
self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0
|
49 |
+
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
50 |
+
T = idx.size(1)
|
51 |
+
if self.max_seq_length < T:
|
52 |
+
raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
|
53 |
+
|
54 |
+
if input_pos is not None: # use the kv cache
|
55 |
+
cos = self.cos.index_select(0, input_pos)
|
56 |
+
sin = self.sin.index_select(0, input_pos)
|
57 |
+
if self.mask_cache is None:
|
58 |
+
raise TypeError("You need to call `gpt.set_kv_cache()`")
|
59 |
+
mask = self.mask_cache.index_select(2, input_pos)
|
60 |
+
else:
|
61 |
+
cos = self.cos[:T]
|
62 |
+
sin = self.sin[:T]
|
63 |
+
mask = None
|
64 |
+
|
65 |
+
x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
66 |
+
for block in self.transformer.h:
|
67 |
+
x = block(x, cos, sin, mask, input_pos)
|
68 |
+
x = self.transformer.ln_f(x)
|
69 |
+
if lm_head_chunk_size > 0:
|
70 |
+
# chunk the lm head logits to reduce the peak memory used by autograd
|
71 |
+
return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)]
|
72 |
+
return self.lm_head(x) # (b, t, vocab_size)
|
73 |
+
|
74 |
+
@classmethod
|
75 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
76 |
+
return cls(Config.from_name(name, **kwargs))
|
77 |
+
|
78 |
+
def _init_weights(self, module: nn.Module) -> None:
|
79 |
+
"""Meant to be used with `gpt.apply(gpt._init_weights)`. Unused method left for completeness."""
|
80 |
+
super()._init_weights(module)
|
81 |
+
if isinstance(module, CausalSelfAttention):
|
82 |
+
module.reset_parameters()
|
83 |
+
|
84 |
+
|
85 |
+
class Block(BaseBlock):
|
86 |
+
"""The implementation is identical to `lit_gpt.model.Block` with the exception that
|
87 |
+
we replace the attention layer where adaption is implemented."""
|
88 |
+
|
89 |
+
def __init__(self, config: Config, block_idx: int) -> None:
|
90 |
+
# Skip the parent class __init__ altogether and replace it to avoid useless allocations
|
91 |
+
nn.Module.__init__(self)
|
92 |
+
self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
93 |
+
self.attn = CausalSelfAttention(config, block_idx)
|
94 |
+
if not config.shared_attention_norm:
|
95 |
+
self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
96 |
+
self.mlp = config.mlp_class(config)
|
97 |
+
|
98 |
+
self.config = config
|
99 |
+
|
100 |
+
|
101 |
+
class CausalSelfAttention(BaseCausalSelfAttention):
|
102 |
+
"""A modification of `lit_gpt.model.CausalSelfAttention` that adds the attention
|
103 |
+
over the adaption prompt."""
|
104 |
+
|
105 |
+
def __init__(self, config: Config, block_idx: int) -> None:
|
106 |
+
super().__init__(config)
|
107 |
+
if block_idx >= config.adapter_start_layer:
|
108 |
+
# adapter embedding layer
|
109 |
+
self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd)
|
110 |
+
# gate for adaption
|
111 |
+
self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1))
|
112 |
+
# kv cache for inference
|
113 |
+
self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
|
114 |
+
self.block_idx = block_idx
|
115 |
+
|
116 |
+
def scaled_dot_product_attention(
|
117 |
+
self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
|
118 |
+
) -> torch.Tensor:
|
119 |
+
y = super().scaled_dot_product_attention(q, k, v, mask)
|
120 |
+
if self.block_idx < self.config.adapter_start_layer:
|
121 |
+
return y
|
122 |
+
|
123 |
+
aT = self.config.adapter_prompt_length
|
124 |
+
if self.adapter_kv_cache is not None:
|
125 |
+
# since this uses the wte weights as the prefix and the kv cache is only used during inference, ak and av
|
126 |
+
# are the same every call
|
127 |
+
ak, av = self.adapter_kv_cache
|
128 |
+
else:
|
129 |
+
prefix = self.adapter_wte.weight.reshape(1, aT, self.config.n_embd)
|
130 |
+
aqkv = self.attn(prefix)
|
131 |
+
q_per_kv = self.config.n_head // self.config.n_query_groups
|
132 |
+
aqkv = aqkv.view(1, aT, self.config.n_query_groups, q_per_kv + 2, self.config.head_size)
|
133 |
+
aqkv = aqkv.permute(0, 2, 3, 1, 4)
|
134 |
+
_, ak, av = aqkv.split((q_per_kv, 1, 1), dim=2)
|
135 |
+
if self.config.n_query_groups != 1:
|
136 |
+
# for MHA this is a no-op
|
137 |
+
ak = ak.repeat_interleave(q_per_kv, dim=2)
|
138 |
+
av = av.repeat_interleave(q_per_kv, dim=2)
|
139 |
+
ak = ak.view(1, -1, aT, self.config.head_size) # (1, nh_ak, aT, hs)
|
140 |
+
av = av.view(1, -1, aT, self.config.head_size) # (1, nh_av, aT, hs)
|
141 |
+
self.adapter_kv_cache = (ak, av)
|
142 |
+
|
143 |
+
T = q.size(2)
|
144 |
+
amask = torch.ones(T, aT, dtype=torch.bool, device=q.device)
|
145 |
+
ay = super().scaled_dot_product_attention(q, ak, av, amask)
|
146 |
+
return y + self.gating_factor * ay
|
147 |
+
|
148 |
+
def reset_parameters(self) -> None:
|
149 |
+
torch.nn.init.zeros_(self.gating_factor)
|
150 |
+
|
151 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
152 |
+
"""For compatibility with older checkpoints."""
|
153 |
+
if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head:
|
154 |
+
state_dict[key] = state_dict[key].permute(0, 2, 1, 3)
|
155 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
156 |
+
|
157 |
+
|
158 |
+
def mark_only_adapter_as_trainable(model: GPT) -> None:
|
159 |
+
"""Sets `requires_grad=False` for all non-adapter weights."""
|
160 |
+
for name, param in model.named_parameters():
|
161 |
+
param.requires_grad = adapter_filter(name, param)
|
162 |
+
|
163 |
+
|
164 |
+
def adapter_filter(key: str, value: Any) -> bool:
|
165 |
+
return "adapter_wte" in key or "gating_factor" in key
|
lit_gpt/adapter_v2.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Implementation of the paper:
|
2 |
+
|
3 |
+
LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model
|
4 |
+
https://arxiv.org/abs/2304.15010
|
5 |
+
|
6 |
+
Port for Lit-GPT
|
7 |
+
"""
|
8 |
+
from dataclasses import dataclass
|
9 |
+
from typing import Any, Dict, Optional, Tuple, Type
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
from typing_extensions import Self
|
14 |
+
|
15 |
+
import lit_gpt
|
16 |
+
from lit_gpt.adapter import GPT as BaseModel
|
17 |
+
from lit_gpt.adapter import Block as BaseBlock
|
18 |
+
from lit_gpt.adapter import CausalSelfAttention as BaseCausalSelfAttention
|
19 |
+
from lit_gpt.adapter import Config as BaseConfig
|
20 |
+
from lit_gpt.model import KVCache
|
21 |
+
from lit_gpt.utils import map_old_state_dict_weights
|
22 |
+
|
23 |
+
|
24 |
+
@dataclass
|
25 |
+
class Config(BaseConfig):
|
26 |
+
@property
|
27 |
+
def mlp_class(self) -> Type:
|
28 |
+
return getattr(lit_gpt.adapter_v2, self._mlp_class)
|
29 |
+
|
30 |
+
|
31 |
+
def adapter_filter(key: str, value: Any) -> bool:
|
32 |
+
adapter_substrings = (
|
33 |
+
# regular adapter v1 parameters
|
34 |
+
"adapter_wte",
|
35 |
+
"gating_factor",
|
36 |
+
# adapter v2: new bias and scale used in Linear
|
37 |
+
"adapter_scale",
|
38 |
+
"adapter_bias",
|
39 |
+
# adapter v2: Norm parameters are now trainable
|
40 |
+
"norm_1",
|
41 |
+
"norm_2",
|
42 |
+
"ln_f",
|
43 |
+
)
|
44 |
+
return any(s in key for s in adapter_substrings)
|
45 |
+
|
46 |
+
|
47 |
+
class AdapterV2Linear(torch.nn.Module):
|
48 |
+
def __init__(self, in_features: int, out_features: int, **kwargs) -> None:
|
49 |
+
super().__init__()
|
50 |
+
self.linear = torch.nn.Linear(in_features, out_features, **kwargs)
|
51 |
+
self.adapter_bias = torch.nn.Parameter(torch.zeros(out_features), requires_grad=False)
|
52 |
+
self.adapter_scale = torch.nn.Parameter(torch.ones(out_features), requires_grad=False)
|
53 |
+
|
54 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
55 |
+
return self.adapter_scale * (self.linear(x) + self.adapter_bias)
|
56 |
+
|
57 |
+
def reset_parameters(self) -> None:
|
58 |
+
nn.init.zeros_(self.adapter_bias)
|
59 |
+
nn.init.ones_(self.adapter_scale)
|
60 |
+
|
61 |
+
|
62 |
+
class GPT(BaseModel):
|
63 |
+
def __init__(self, config: Config) -> None:
|
64 |
+
# Skip the parent class __init__ altogether and replace it to avoid useless allocations
|
65 |
+
nn.Module.__init__(self)
|
66 |
+
assert config.padded_vocab_size is not None
|
67 |
+
self.config = config
|
68 |
+
|
69 |
+
self.lm_head = AdapterV2Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
|
70 |
+
self.transformer = nn.ModuleDict(
|
71 |
+
dict(
|
72 |
+
wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
|
73 |
+
h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
|
74 |
+
ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
|
75 |
+
)
|
76 |
+
)
|
77 |
+
self.max_seq_length = self.config.block_size
|
78 |
+
self.mask_cache: Optional[torch.Tensor] = None
|
79 |
+
|
80 |
+
@classmethod
|
81 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
82 |
+
return cls(Config.from_name(name, **kwargs))
|
83 |
+
|
84 |
+
def _init_weights(self, module: nn.Module) -> None:
|
85 |
+
"""Meant to be used with `gpt.apply(gpt._init_weights)`. Unused method left for completeness."""
|
86 |
+
super()._init_weights(module)
|
87 |
+
if isinstance(module, AdapterV2Linear):
|
88 |
+
module.reset_parameters()
|
89 |
+
|
90 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
91 |
+
"""For compatibility with base checkpoints."""
|
92 |
+
mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"}
|
93 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
94 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
95 |
+
|
96 |
+
|
97 |
+
class Block(BaseBlock):
|
98 |
+
"""The implementation is identical to `lit_gpt.model.Block` with the exception that
|
99 |
+
we replace the attention layer where adaption is implemented."""
|
100 |
+
|
101 |
+
def __init__(self, config: Config, block_idx: int) -> None:
|
102 |
+
# Skip the parent class __init__ altogether and replace it to avoid useless allocations
|
103 |
+
nn.Module.__init__(self)
|
104 |
+
self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
105 |
+
self.attn = CausalSelfAttention(config, block_idx)
|
106 |
+
if not config.shared_attention_norm:
|
107 |
+
self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
108 |
+
self.mlp = config.mlp_class(config)
|
109 |
+
|
110 |
+
self.config = config
|
111 |
+
|
112 |
+
|
113 |
+
class CausalSelfAttention(BaseCausalSelfAttention):
|
114 |
+
"""A modification of `lit_gpt.adapter.CausalSelfAttention` that uses the Adapter V2 Linear class"""
|
115 |
+
|
116 |
+
def __init__(self, config: Config, block_idx: int) -> None:
|
117 |
+
# Skip the parent class __init__ altogether and replace it to avoid useless allocations
|
118 |
+
nn.Module.__init__(self)
|
119 |
+
shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
|
120 |
+
# key, query, value projections for all heads, but in a batch
|
121 |
+
self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias)
|
122 |
+
# output projection
|
123 |
+
self.proj = AdapterV2Linear(config.n_embd, config.n_embd, bias=config.bias)
|
124 |
+
# disabled by default
|
125 |
+
self.kv_cache: Optional[KVCache] = None
|
126 |
+
|
127 |
+
if block_idx >= config.adapter_start_layer:
|
128 |
+
# adapter embedding layer
|
129 |
+
self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd)
|
130 |
+
# gate for adaption
|
131 |
+
self.gating_factor = torch.nn.Parameter(torch.zeros(1, 1, config.n_head, 1))
|
132 |
+
# kv cache for inference
|
133 |
+
self.adapter_kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
|
134 |
+
self.block_idx = block_idx
|
135 |
+
|
136 |
+
self.config = config
|
137 |
+
|
138 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
139 |
+
"""For compatibility with base checkpoints."""
|
140 |
+
mapping = {
|
141 |
+
"attn.weight": "attn.linear.weight",
|
142 |
+
"attn.bias": "attn.linear.bias",
|
143 |
+
"proj.weight": "proj.linear.weight",
|
144 |
+
"proj.bias": "proj.linear.bias",
|
145 |
+
}
|
146 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
147 |
+
# For compatibility with older checkpoints
|
148 |
+
if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head:
|
149 |
+
state_dict[key] = state_dict[key].permute(0, 2, 1, 3)
|
150 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
151 |
+
|
152 |
+
|
153 |
+
class GptNeoxMLP(lit_gpt.model.GptNeoxMLP):
|
154 |
+
def __init__(self, config: Config) -> None:
|
155 |
+
nn.Module.__init__(self)
|
156 |
+
self.fc = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
|
157 |
+
self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias)
|
158 |
+
|
159 |
+
self.config = config
|
160 |
+
|
161 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
162 |
+
"""For compatibility with base checkpoints."""
|
163 |
+
mapping = {
|
164 |
+
"fc.weight": "fc.linear.weight",
|
165 |
+
"fc.bias": "fc.linear.bias",
|
166 |
+
"proj.weight": "proj.linear.weight",
|
167 |
+
"proj.bias": "proj.linear.bias",
|
168 |
+
}
|
169 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
170 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
171 |
+
|
172 |
+
|
173 |
+
class LLaMAMLP(lit_gpt.model.LLaMAMLP):
|
174 |
+
def __init__(self, config: Config) -> None:
|
175 |
+
nn.Module.__init__(self)
|
176 |
+
self.fc_1 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
|
177 |
+
self.fc_2 = AdapterV2Linear(config.n_embd, config.intermediate_size, bias=config.bias)
|
178 |
+
self.proj = AdapterV2Linear(config.intermediate_size, config.n_embd, bias=config.bias)
|
179 |
+
|
180 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
181 |
+
"""For compatibility with base checkpoints."""
|
182 |
+
mapping = {
|
183 |
+
"fc_1.weight": "fc_1.linear.weight",
|
184 |
+
"fc_1.bias": "fc_1.linear.bias",
|
185 |
+
"fc_2.weight": "fc_2.linear.weight",
|
186 |
+
"fc_2.bias": "fc_2.linear.bias",
|
187 |
+
"proj.weight": "proj.linear.weight",
|
188 |
+
"proj.bias": "proj.linear.bias",
|
189 |
+
}
|
190 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
191 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
192 |
+
|
193 |
+
|
194 |
+
def mark_only_adapter_v2_as_trainable(model: GPT) -> None:
|
195 |
+
"""Sets requires_grad=False for all non-adapter weights"""
|
196 |
+
for name, param in model.named_parameters():
|
197 |
+
param.requires_grad = adapter_filter(name, param)
|
lit_gpt/config.py
ADDED
@@ -0,0 +1,1203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from copy import deepcopy
|
3 |
+
from dataclasses import dataclass, field
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Any, Literal, Optional, Type, Union
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from typing_extensions import Self
|
9 |
+
|
10 |
+
import lit_gpt.model
|
11 |
+
from lit_gpt.utils import find_multiple
|
12 |
+
|
13 |
+
|
14 |
+
@dataclass
|
15 |
+
class Config:
|
16 |
+
name: str = ""
|
17 |
+
hf_config: dict = field(default_factory=dict)
|
18 |
+
block_size: int = 4096
|
19 |
+
vocab_size: int = 50254
|
20 |
+
padding_multiple: int = 512
|
21 |
+
padded_vocab_size: Optional[int] = None
|
22 |
+
n_layer: int = 16
|
23 |
+
n_head: int = 32
|
24 |
+
n_embd: int = 4096
|
25 |
+
rotary_percentage: float = 0.25
|
26 |
+
parallel_residual: bool = True
|
27 |
+
bias: bool = True
|
28 |
+
lm_head_bias: bool = False
|
29 |
+
# to use multi-head attention (MHA), set this to `n_head` (default)
|
30 |
+
# to use multi-query attention (MQA), set this to 1
|
31 |
+
# to use grouped-query attention (GQA), set this to a value in between
|
32 |
+
# Example with `n_head=4`
|
33 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
|
34 |
+
# │ v ││ v ││ v ││ v │ │ v │ │ v │ │ v │
|
35 |
+
# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
|
36 |
+
# │ │ │ │ │ │ │
|
37 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
|
38 |
+
# │ k ││ k ││ k ││ k │ │ k │ │ k │ │ k │
|
39 |
+
# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
|
40 |
+
# │ │ │ │ ┌──┴──┐ ┌──┴──┐ ┌────┬──┴─┬────┐
|
41 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐
|
42 |
+
# │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │
|
43 |
+
# └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘
|
44 |
+
# ◀──────────────────▶ ◀──────────────────▶ ◀──────────────────▶
|
45 |
+
# MHA GQA MQA
|
46 |
+
# n_query_groups=4 n_query_groups=2 n_query_groups=1
|
47 |
+
#
|
48 |
+
# credit https://arxiv.org/pdf/2305.13245.pdf
|
49 |
+
n_query_groups: Optional[int] = None
|
50 |
+
shared_attention_norm: bool = False
|
51 |
+
_norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
|
52 |
+
norm_eps: float = 1e-5
|
53 |
+
_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
|
54 |
+
gelu_approximate: str = "none"
|
55 |
+
intermediate_size: Optional[int] = None
|
56 |
+
rope_condense_ratio: int = 1
|
57 |
+
rope_base: int = 10000
|
58 |
+
|
59 |
+
def __post_init__(self):
|
60 |
+
if not self.name:
|
61 |
+
self.name = self.hf_config.get("name", self.name)
|
62 |
+
|
63 |
+
assert self.n_embd % self.n_head == 0
|
64 |
+
self.head_size = self.n_embd // self.n_head
|
65 |
+
|
66 |
+
# vocab size should be a power of 2 to be optimal on hardware. compute the closest value
|
67 |
+
if self.padded_vocab_size is None:
|
68 |
+
self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
|
69 |
+
else:
|
70 |
+
# vocab size shouldn't be larger than padded vocab size
|
71 |
+
self.vocab_size = min(self.vocab_size, self.padded_vocab_size)
|
72 |
+
|
73 |
+
# compute the number of query groups
|
74 |
+
if self.n_query_groups is not None:
|
75 |
+
assert self.n_head % self.n_query_groups == 0
|
76 |
+
else:
|
77 |
+
self.n_query_groups = self.n_head
|
78 |
+
|
79 |
+
# compute the intermediate size for MLP if not set
|
80 |
+
if self.intermediate_size is None:
|
81 |
+
if self._mlp_class == "LLaMAMLP":
|
82 |
+
raise ValueError("The config needs to set the `intermediate_size`")
|
83 |
+
self.intermediate_size = 4 * self.n_embd
|
84 |
+
|
85 |
+
self.rope_n_elem = int(self.rotary_percentage * self.head_size)
|
86 |
+
|
87 |
+
@classmethod
|
88 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
89 |
+
if name not in name_to_config:
|
90 |
+
# search through all `config['hf_config']['name']`
|
91 |
+
try:
|
92 |
+
conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
|
93 |
+
except StopIteration:
|
94 |
+
raise ValueError(f"{name!r} is not a supported config name")
|
95 |
+
else:
|
96 |
+
conf_dict = name_to_config[name]
|
97 |
+
|
98 |
+
conf_dict = conf_dict.copy()
|
99 |
+
if "condense_ratio" in kwargs: # legacy name
|
100 |
+
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
|
101 |
+
conf_dict.update(kwargs)
|
102 |
+
return cls(**conf_dict)
|
103 |
+
|
104 |
+
@classmethod
|
105 |
+
def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
|
106 |
+
with open(path, encoding="utf-8") as fp:
|
107 |
+
json_kwargs = json.load(fp)
|
108 |
+
if "condense_ratio" in json_kwargs: # legacy name
|
109 |
+
json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
|
110 |
+
if "condense_ratio" in kwargs: # legacy name
|
111 |
+
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
|
112 |
+
if "org" in json_kwargs: # legacy name
|
113 |
+
json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
|
114 |
+
if "org" in kwargs: # legacy name
|
115 |
+
kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
|
116 |
+
json_kwargs.update(kwargs)
|
117 |
+
return cls(**json_kwargs)
|
118 |
+
|
119 |
+
@classmethod
|
120 |
+
def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self:
|
121 |
+
"""Automatically load `lit_config.json` and if it doesn't exist - a matching config from `lit_gpt/config.py`."""
|
122 |
+
if (config_path := path / "lit_config.json").is_file():
|
123 |
+
return cls.from_json(config_path, **kwargs)
|
124 |
+
if (model_name := path.name) in name_to_config:
|
125 |
+
return cls.from_name(model_name, **kwargs)
|
126 |
+
raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor matching config exists.")
|
127 |
+
|
128 |
+
@property
|
129 |
+
def mlp_class(self) -> Type:
|
130 |
+
# `self._mlp_class` cannot be the type to keep the config json serializable
|
131 |
+
return getattr(lit_gpt.model, self._mlp_class)
|
132 |
+
|
133 |
+
@property
|
134 |
+
def norm_class(self) -> Type:
|
135 |
+
# `self._norm_class` cannot be the type to keep the config json serializable
|
136 |
+
if self._norm_class == "RMSNorm":
|
137 |
+
from lit_gpt.rmsnorm import RMSNorm
|
138 |
+
|
139 |
+
return RMSNorm
|
140 |
+
return getattr(torch.nn, self._norm_class)
|
141 |
+
|
142 |
+
|
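The derived fields set in `__post_init__` above can be checked with a small standalone sketch (not part of the commit; `find_multiple` is re-implemented here under the assumption that it matches the round-up helper in `lit_gpt/utils.py`):
# Hedged sketch: how Config.__post_init__ derives head_size, padded_vocab_size,
# n_query_groups and rope_n_elem from the defaults above.
def find_multiple(n: int, k: int) -> int:
    # round n up to the nearest multiple of k (assumed behaviour of lit_gpt/utils.py)
    return n if n % k == 0 else n + k - (n % k)

vocab_size, padding_multiple = 50254, 512
n_embd, n_head, rotary_percentage = 4096, 32, 0.25
n_query_groups = None

head_size = n_embd // n_head                                      # 128
padded_vocab_size = find_multiple(vocab_size, padding_multiple)   # 50688
n_query_groups = n_query_groups if n_query_groups is not None else n_head  # 32 -> plain MHA
rope_n_elem = int(rotary_percentage * head_size)                  # 32
print(head_size, padded_vocab_size, n_query_groups, rope_n_elem)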
143 |
+
########################
|
144 |
+
# Stability AI StableLM
|
145 |
+
########################
|
146 |
+
configs = [
|
147 |
+
# https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
|
148 |
+
dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
|
149 |
+
# https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
|
150 |
+
dict(
|
151 |
+
name="stablelm-base-alpha-7b",
|
152 |
+
hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
|
153 |
+
n_head=48,
|
154 |
+
n_embd=6144,
|
155 |
+
padding_multiple=256,
|
156 |
+
),
|
157 |
+
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
|
158 |
+
dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
|
159 |
+
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
|
160 |
+
dict(
|
161 |
+
name="stablelm-tuned-alpha-7b",
|
162 |
+
hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
|
163 |
+
n_head=48,
|
164 |
+
n_embd=6144,
|
165 |
+
padding_multiple=256,
|
166 |
+
),
|
167 |
+
]
|
168 |
+
|
169 |
+
####################
|
170 |
+
# EleutherAI Pythia
|
171 |
+
####################
|
172 |
+
pythia = [
|
173 |
+
# https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json
|
174 |
+
dict(
|
175 |
+
name="pythia-14m",
|
176 |
+
hf_config=dict(org="EleutherAI", name="pythia-14m"),
|
177 |
+
block_size=512,
|
178 |
+
n_layer=6,
|
179 |
+
n_embd=128,
|
180 |
+
n_head=4,
|
181 |
+
padding_multiple=128,
|
182 |
+
),
|
183 |
+
# https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json
|
184 |
+
dict(
|
185 |
+
name="pythia-31m",
|
186 |
+
hf_config=dict(org="EleutherAI", name="pythia-31m"),
|
187 |
+
block_size=1024,
|
188 |
+
n_layer=6,
|
189 |
+
n_embd=256,
|
190 |
+
n_head=8,
|
191 |
+
padding_multiple=128,
|
192 |
+
),
|
193 |
+
# https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
|
194 |
+
dict(
|
195 |
+
name="pythia-70m",
|
196 |
+
hf_config=dict(org="EleutherAI", name="pythia-70m"),
|
197 |
+
block_size=2048,
|
198 |
+
n_layer=6,
|
199 |
+
n_embd=512,
|
200 |
+
n_head=8,
|
201 |
+
padding_multiple=128,
|
202 |
+
),
|
203 |
+
# https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
|
204 |
+
dict(
|
205 |
+
name="pythia-160m",
|
206 |
+
hf_config=dict(org="EleutherAI", name="pythia-160m"),
|
207 |
+
block_size=2048,
|
208 |
+
n_layer=12,
|
209 |
+
n_embd=768,
|
210 |
+
n_head=12,
|
211 |
+
padding_multiple=128,
|
212 |
+
),
|
213 |
+
# https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
|
214 |
+
dict(
|
215 |
+
name="pythia-410m",
|
216 |
+
hf_config=dict(org="EleutherAI", name="pythia-410m"),
|
217 |
+
block_size=2048,
|
218 |
+
n_layer=24,
|
219 |
+
n_embd=1024,
|
220 |
+
n_head=16,
|
221 |
+
padding_multiple=128,
|
222 |
+
),
|
223 |
+
# https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
|
224 |
+
dict(
|
225 |
+
name="pythia-1b",
|
226 |
+
hf_config=dict(org="EleutherAI", name="pythia-1b"),
|
227 |
+
block_size=2048,
|
228 |
+
n_embd=2048,
|
229 |
+
n_head=8,
|
230 |
+
padding_multiple=128,
|
231 |
+
),
|
232 |
+
# https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
|
233 |
+
dict(
|
234 |
+
name="pythia-1.4b",
|
235 |
+
hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
|
236 |
+
block_size=2048,
|
237 |
+
n_layer=24,
|
238 |
+
n_embd=2048,
|
239 |
+
n_head=16,
|
240 |
+
padding_multiple=128,
|
241 |
+
),
|
242 |
+
# https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
|
243 |
+
dict(
|
244 |
+
name="pythia-2.8b",
|
245 |
+
hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
|
246 |
+
block_size=2048,
|
247 |
+
n_layer=32,
|
248 |
+
n_embd=2560,
|
249 |
+
padding_multiple=128,
|
250 |
+
),
|
251 |
+
# https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
|
252 |
+
dict(
|
253 |
+
name="pythia-6.9b",
|
254 |
+
hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
|
255 |
+
block_size=2048,
|
256 |
+
n_layer=32,
|
257 |
+
padding_multiple=256,
|
258 |
+
),
|
259 |
+
# https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
|
260 |
+
dict(
|
261 |
+
name="pythia-12b",
|
262 |
+
hf_config=dict(org="EleutherAI", name="pythia-12b"),
|
263 |
+
block_size=2048,
|
264 |
+
n_layer=36,
|
265 |
+
n_embd=5120,
|
266 |
+
n_head=40,
|
267 |
+
),
|
268 |
+
]
|
269 |
+
configs.extend(pythia)
|
270 |
+
for c in pythia:
|
271 |
+
# "pythia-14m" and "pythia-31m" don't have deduped version
|
272 |
+
if c["name"] in ("pythia-14m", "pythia-31m"):
|
273 |
+
continue
|
274 |
+
copy = deepcopy(c)
|
275 |
+
copy["name"] = f"{c['name']}-deduped"
|
276 |
+
copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
|
277 |
+
configs.append(copy)
|
278 |
+
|
279 |
+
|
280 |
+
####################################
|
281 |
+
# togethercomputer RedPajama INCITE
|
282 |
+
####################################
|
283 |
+
redpajama_incite = [
|
284 |
+
# https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
|
285 |
+
dict(
|
286 |
+
name="RedPajama-INCITE-{}-3B-v1",
|
287 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
|
288 |
+
block_size=2048,
|
289 |
+
n_layer=32,
|
290 |
+
n_embd=2560,
|
291 |
+
padding_multiple=256,
|
292 |
+
rotary_percentage=1.0,
|
293 |
+
parallel_residual=False,
|
294 |
+
),
|
295 |
+
# https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
|
296 |
+
dict(
|
297 |
+
name="RedPajama-INCITE-7B-{}",
|
298 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
|
299 |
+
block_size=2048,
|
300 |
+
n_layer=32,
|
301 |
+
padding_multiple=256,
|
302 |
+
rotary_percentage=1.0,
|
303 |
+
parallel_residual=False,
|
304 |
+
),
|
305 |
+
# this redirects to the checkpoint above. kept for those who had the old weights already downloaded
|
306 |
+
dict(
|
307 |
+
name="RedPajama-INCITE-{}-7B-v0.1",
|
308 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
|
309 |
+
block_size=2048,
|
310 |
+
n_layer=32,
|
311 |
+
padding_multiple=256,
|
312 |
+
rotary_percentage=1.0,
|
313 |
+
parallel_residual=False,
|
314 |
+
),
|
315 |
+
]
|
316 |
+
for c in redpajama_incite:
|
317 |
+
for kind in ("Base", "Chat", "Instruct"):
|
318 |
+
copy = deepcopy(c)
|
319 |
+
copy["name"] = c["name"].format(kind)
|
320 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
321 |
+
configs.append(copy)
|
322 |
+
|
323 |
+
|
324 |
+
#################
|
325 |
+
# TII UAE Falcon
|
326 |
+
#################
|
327 |
+
falcon = [
|
328 |
+
# https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
|
329 |
+
dict(
|
330 |
+
name="falcon-7b{}",
|
331 |
+
hf_config=dict(org="tiiuae", name="falcon-7b{}"),
|
332 |
+
block_size=2048,
|
333 |
+
vocab_size=65024,
|
334 |
+
padded_vocab_size=65024,
|
335 |
+
n_layer=32,
|
336 |
+
n_head=71,
|
337 |
+
n_embd=4544,
|
338 |
+
rotary_percentage=1.0,
|
339 |
+
n_query_groups=1,
|
340 |
+
bias=False,
|
341 |
+
# this is not in the config, but in the original model implementation, only for this config
|
342 |
+
shared_attention_norm=True,
|
343 |
+
),
|
344 |
+
# https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
|
345 |
+
dict(
|
346 |
+
name="falcon-40b{}",
|
347 |
+
hf_config=dict(org="tiiuae", name="falcon-40b{}"),
|
348 |
+
block_size=2048,
|
349 |
+
vocab_size=65024,
|
350 |
+
padded_vocab_size=65024,
|
351 |
+
n_layer=60,
|
352 |
+
n_head=128,
|
353 |
+
n_embd=8192,
|
354 |
+
rotary_percentage=1.0,
|
355 |
+
n_query_groups=8,
|
356 |
+
bias=False,
|
357 |
+
),
|
358 |
+
]
|
359 |
+
for c in falcon:
|
360 |
+
for kind in ("", "-instruct"):
|
361 |
+
copy = deepcopy(c)
|
362 |
+
copy["name"] = c["name"].format(kind)
|
363 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
364 |
+
configs.append(copy)
|
365 |
+
|
366 |
+
# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
|
367 |
+
falcon180b = dict(
|
368 |
+
name="falcon-180B{}",
|
369 |
+
hf_config=dict(org="tiiuae", name="falcon-180B{}"),
|
370 |
+
block_size=2048,
|
371 |
+
vocab_size=65024,
|
372 |
+
padded_vocab_size=65024,
|
373 |
+
n_layer=80,
|
374 |
+
n_head=232,
|
375 |
+
n_embd=14848,
|
376 |
+
rotary_percentage=1.0,
|
377 |
+
n_query_groups=8,
|
378 |
+
bias=False,
|
379 |
+
)
|
380 |
+
|
381 |
+
for kind in ("", "-chat"):
|
382 |
+
copy = deepcopy(falcon180b)
|
383 |
+
copy["name"] = falcon180b["name"].format(kind)
|
384 |
+
copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
|
385 |
+
configs.append(copy)
|
386 |
+
|
387 |
+
|
388 |
+
#############################
|
389 |
+
# OpenLM Research Open LLaMA
|
390 |
+
#############################
|
391 |
+
open_LLaMA = [
|
392 |
+
# https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
|
393 |
+
dict(
|
394 |
+
name="open_llama_3b",
|
395 |
+
hf_config=dict(org="openlm-research", name="open_llama_3b"),
|
396 |
+
block_size=2048,
|
397 |
+
vocab_size=32000,
|
398 |
+
padding_multiple=64,
|
399 |
+
n_layer=26,
|
400 |
+
n_embd=3200,
|
401 |
+
rotary_percentage=1.0,
|
402 |
+
parallel_residual=False,
|
403 |
+
bias=False,
|
404 |
+
_norm_class="RMSNorm",
|
405 |
+
norm_eps=1e-6,
|
406 |
+
_mlp_class="LLaMAMLP",
|
407 |
+
intermediate_size=8640,
|
408 |
+
),
|
409 |
+
# https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
|
410 |
+
dict(
|
411 |
+
name="open_llama_7b",
|
412 |
+
hf_config=dict(org="openlm-research", name="open_llama_7b"),
|
413 |
+
block_size=2048,
|
414 |
+
vocab_size=32000,
|
415 |
+
padding_multiple=64,
|
416 |
+
n_layer=32,
|
417 |
+
rotary_percentage=1.0,
|
418 |
+
parallel_residual=False,
|
419 |
+
bias=False,
|
420 |
+
_norm_class="RMSNorm",
|
421 |
+
norm_eps=1e-6,
|
422 |
+
_mlp_class="LLaMAMLP",
|
423 |
+
intermediate_size=11008,
|
424 |
+
),
|
425 |
+
# https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
|
426 |
+
dict(
|
427 |
+
name="open_llama_13b",
|
428 |
+
hf_config=dict(org="openlm-research", name="open_llama_13b"),
|
429 |
+
block_size=2048,
|
430 |
+
vocab_size=32000,
|
431 |
+
padding_multiple=64,
|
432 |
+
n_layer=40,
|
433 |
+
n_head=40,
|
434 |
+
n_embd=5120,
|
435 |
+
rotary_percentage=1.0,
|
436 |
+
parallel_residual=False,
|
437 |
+
bias=False,
|
438 |
+
_norm_class="RMSNorm",
|
439 |
+
norm_eps=1e-6,
|
440 |
+
_mlp_class="LLaMAMLP",
|
441 |
+
intermediate_size=13824,
|
442 |
+
),
|
443 |
+
]
|
444 |
+
configs.extend(open_LLaMA)
|
445 |
+
|
446 |
+
|
447 |
+
###############
|
448 |
+
# LMSYS Vicuna
|
449 |
+
###############
|
450 |
+
vicuna = [
|
451 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
|
452 |
+
dict(
|
453 |
+
name="vicuna-7b-v1.3",
|
454 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
|
455 |
+
block_size=2048,
|
456 |
+
vocab_size=32000,
|
457 |
+
padding_multiple=64,
|
458 |
+
n_layer=32,
|
459 |
+
rotary_percentage=1.0,
|
460 |
+
parallel_residual=False,
|
461 |
+
bias=False,
|
462 |
+
_norm_class="RMSNorm",
|
463 |
+
norm_eps=1e-6,
|
464 |
+
_mlp_class="LLaMAMLP",
|
465 |
+
intermediate_size=11008,
|
466 |
+
),
|
467 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
|
468 |
+
dict(
|
469 |
+
name="vicuna-13b-v1.3",
|
470 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
|
471 |
+
block_size=2048,
|
472 |
+
vocab_size=32000,
|
473 |
+
padding_multiple=64,
|
474 |
+
n_layer=40,
|
475 |
+
n_head=40,
|
476 |
+
n_embd=5120,
|
477 |
+
rotary_percentage=1.0,
|
478 |
+
parallel_residual=False,
|
479 |
+
bias=False,
|
480 |
+
_norm_class="RMSNorm",
|
481 |
+
norm_eps=1e-6,
|
482 |
+
_mlp_class="LLaMAMLP",
|
483 |
+
intermediate_size=13824,
|
484 |
+
),
|
485 |
+
# https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
|
486 |
+
dict(
|
487 |
+
name="vicuna-33b-v1.3",
|
488 |
+
hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
|
489 |
+
block_size=2048,
|
490 |
+
vocab_size=32000,
|
491 |
+
padding_multiple=64,
|
492 |
+
n_layer=60,
|
493 |
+
n_head=52,
|
494 |
+
n_embd=6656,
|
495 |
+
rotary_percentage=1.0,
|
496 |
+
parallel_residual=False,
|
497 |
+
bias=False,
|
498 |
+
_norm_class="RMSNorm",
|
499 |
+
norm_eps=1e-6,
|
500 |
+
_mlp_class="LLaMAMLP",
|
501 |
+
intermediate_size=17920,
|
502 |
+
),
|
503 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
|
504 |
+
dict(
|
505 |
+
name="vicuna-7b-v1.5",
|
506 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
|
507 |
+
vocab_size=32000,
|
508 |
+
padding_multiple=64,
|
509 |
+
n_layer=32,
|
510 |
+
rotary_percentage=1.0,
|
511 |
+
parallel_residual=False,
|
512 |
+
bias=False,
|
513 |
+
_norm_class="RMSNorm",
|
514 |
+
_mlp_class="LLaMAMLP",
|
515 |
+
intermediate_size=11008,
|
516 |
+
),
|
517 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
|
518 |
+
dict(
|
519 |
+
name="vicuna-7b-v1.5-16k",
|
520 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
|
521 |
+
block_size=16384,
|
522 |
+
vocab_size=32000,
|
523 |
+
padding_multiple=64,
|
524 |
+
n_layer=32,
|
525 |
+
rotary_percentage=1.0,
|
526 |
+
parallel_residual=False,
|
527 |
+
bias=False,
|
528 |
+
_norm_class="RMSNorm",
|
529 |
+
_mlp_class="LLaMAMLP",
|
530 |
+
intermediate_size=11008,
|
531 |
+
rope_condense_ratio=4,
|
532 |
+
),
|
533 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
|
534 |
+
dict(
|
535 |
+
name="vicuna-13b-v1.5",
|
536 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
|
537 |
+
vocab_size=32000,
|
538 |
+
padding_multiple=64,
|
539 |
+
n_layer=40,
|
540 |
+
n_head=40,
|
541 |
+
n_embd=5120,
|
542 |
+
rotary_percentage=1.0,
|
543 |
+
parallel_residual=False,
|
544 |
+
bias=False,
|
545 |
+
_norm_class="RMSNorm",
|
546 |
+
_mlp_class="LLaMAMLP",
|
547 |
+
intermediate_size=13824,
|
548 |
+
),
|
549 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
|
550 |
+
dict(
|
551 |
+
name="vicuna-13b-v1.5-16k",
|
552 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
|
553 |
+
block_size=16384,
|
554 |
+
vocab_size=32000,
|
555 |
+
padding_multiple=64,
|
556 |
+
n_layer=40,
|
557 |
+
n_head=40,
|
558 |
+
n_embd=5120,
|
559 |
+
rotary_percentage=1.0,
|
560 |
+
parallel_residual=False,
|
561 |
+
bias=False,
|
562 |
+
_norm_class="RMSNorm",
|
563 |
+
_mlp_class="LLaMAMLP",
|
564 |
+
intermediate_size=13824,
|
565 |
+
rope_condense_ratio=4,
|
566 |
+
),
|
567 |
+
]
|
568 |
+
configs.extend(vicuna)
|
569 |
+
|
570 |
+
|
571 |
+
#################
|
572 |
+
# LMSYS LongChat
|
573 |
+
#################
|
574 |
+
long_chat = [
|
575 |
+
# https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
|
576 |
+
dict(
|
577 |
+
name="longchat-7b-16k",
|
578 |
+
hf_config=dict(org="lmsys", name="longchat-7b-16k"),
|
579 |
+
block_size=16384,
|
580 |
+
vocab_size=32000,
|
581 |
+
padding_multiple=64,
|
582 |
+
n_layer=32,
|
583 |
+
rotary_percentage=1.0,
|
584 |
+
parallel_residual=False,
|
585 |
+
bias=False,
|
586 |
+
_norm_class="RMSNorm",
|
587 |
+
norm_eps=1e-6,
|
588 |
+
_mlp_class="LLaMAMLP",
|
589 |
+
intermediate_size=11008,
|
590 |
+
rope_condense_ratio=8,
|
591 |
+
),
|
592 |
+
# https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
|
593 |
+
dict(
|
594 |
+
name="longchat-13b-16k",
|
595 |
+
hf_config=dict(org="lmsys", name="longchat-13b-16k"),
|
596 |
+
block_size=16384,
|
597 |
+
vocab_size=32000,
|
598 |
+
padding_multiple=64,
|
599 |
+
n_layer=40,
|
600 |
+
n_head=40,
|
601 |
+
n_embd=5120,
|
602 |
+
rotary_percentage=1.0,
|
603 |
+
parallel_residual=False,
|
604 |
+
bias=False,
|
605 |
+
_norm_class="RMSNorm",
|
606 |
+
norm_eps=1e-6,
|
607 |
+
_mlp_class="LLaMAMLP",
|
608 |
+
intermediate_size=13824,
|
609 |
+
rope_condense_ratio=8,
|
610 |
+
),
|
611 |
+
]
|
612 |
+
configs.extend(long_chat)
|
613 |
+
|
614 |
+
|
615 |
+
######################
|
616 |
+
# NousResearch Hermes
|
617 |
+
######################
|
618 |
+
nous_research = [
|
619 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
|
620 |
+
dict(
|
621 |
+
name="Nous-Hermes-llama-2-7b",
|
622 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
|
623 |
+
padded_vocab_size=32000,
|
624 |
+
n_layer=32,
|
625 |
+
rotary_percentage=1.0,
|
626 |
+
parallel_residual=False,
|
627 |
+
bias=False,
|
628 |
+
_norm_class="RMSNorm",
|
629 |
+
norm_eps=1e-05,
|
630 |
+
_mlp_class="LLaMAMLP",
|
631 |
+
intermediate_size=11008,
|
632 |
+
),
|
633 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
|
634 |
+
dict(
|
635 |
+
name="Nous-Hermes-13b",
|
636 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
|
637 |
+
block_size=2048,
|
638 |
+
vocab_size=32000,
|
639 |
+
padded_vocab_size=32001,
|
640 |
+
n_layer=40,
|
641 |
+
n_head=40,
|
642 |
+
n_embd=5120,
|
643 |
+
rotary_percentage=1.0,
|
644 |
+
parallel_residual=False,
|
645 |
+
bias=False,
|
646 |
+
_norm_class="RMSNorm",
|
647 |
+
norm_eps=1e-6,
|
648 |
+
_mlp_class="LLaMAMLP",
|
649 |
+
intermediate_size=13824,
|
650 |
+
),
|
651 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
|
652 |
+
dict(
|
653 |
+
name="Nous-Hermes-Llama2-13b",
|
654 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
|
655 |
+
vocab_size=32000,
|
656 |
+
padded_vocab_size=32032,
|
657 |
+
n_layer=40,
|
658 |
+
n_head=40,
|
659 |
+
n_embd=5120,
|
660 |
+
rotary_percentage=1.0,
|
661 |
+
parallel_residual=False,
|
662 |
+
bias=False,
|
663 |
+
_norm_class="RMSNorm",
|
664 |
+
norm_eps=1e-05,
|
665 |
+
_mlp_class="LLaMAMLP",
|
666 |
+
intermediate_size=13824,
|
667 |
+
),
|
668 |
+
]
|
669 |
+
configs.extend(nous_research)
|
670 |
+
|
671 |
+
|
672 |
+
###############
|
673 |
+
# Meta LLaMA 2
|
674 |
+
###############
|
675 |
+
llama_2 = [
|
676 |
+
# https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
|
677 |
+
dict(
|
678 |
+
name="Llama-2-7b{}-hf",
|
679 |
+
hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
|
680 |
+
vocab_size=32000,
|
681 |
+
padding_multiple=64,
|
682 |
+
n_layer=32,
|
683 |
+
rotary_percentage=1.0,
|
684 |
+
parallel_residual=False,
|
685 |
+
bias=False,
|
686 |
+
_norm_class="RMSNorm",
|
687 |
+
_mlp_class="LLaMAMLP",
|
688 |
+
intermediate_size=11008,
|
689 |
+
),
|
690 |
+
# https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
|
691 |
+
dict(
|
692 |
+
name="Llama-2-13b{}-hf",
|
693 |
+
hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
|
694 |
+
vocab_size=32000,
|
695 |
+
padding_multiple=64,
|
696 |
+
n_layer=40,
|
697 |
+
n_head=40,
|
698 |
+
n_embd=5120,
|
699 |
+
rotary_percentage=1.0,
|
700 |
+
parallel_residual=False,
|
701 |
+
bias=False,
|
702 |
+
_norm_class="RMSNorm",
|
703 |
+
_mlp_class="LLaMAMLP",
|
704 |
+
intermediate_size=13824,
|
705 |
+
),
|
706 |
+
# https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
|
707 |
+
dict(
|
708 |
+
name="Llama-2-70b{}-hf",
|
709 |
+
hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
|
710 |
+
vocab_size=32000,
|
711 |
+
padding_multiple=64,
|
712 |
+
n_layer=80,
|
713 |
+
n_head=64,
|
714 |
+
n_embd=8192,
|
715 |
+
n_query_groups=8,
|
716 |
+
rotary_percentage=1.0,
|
717 |
+
parallel_residual=False,
|
718 |
+
bias=False,
|
719 |
+
_norm_class="RMSNorm",
|
720 |
+
_mlp_class="LLaMAMLP",
|
721 |
+
intermediate_size=28672,
|
722 |
+
),
|
723 |
+
]
|
724 |
+
for c in llama_2:
|
725 |
+
for kind in ("", "-chat"):
|
726 |
+
copy = deepcopy(c)
|
727 |
+
copy["name"] = c["name"].format(kind)
|
728 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
729 |
+
configs.append(copy)
|
730 |
+
|
731 |
+
|
732 |
+
##########################
|
733 |
+
# Stability AI FreeWilly2
|
734 |
+
##########################
|
735 |
+
freewilly_2 = [
|
736 |
+
# https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
|
737 |
+
dict(
|
738 |
+
name="FreeWilly2",
|
739 |
+
hf_config=dict(org="stabilityai", name="FreeWilly2"),
|
740 |
+
vocab_size=32000,
|
741 |
+
padding_multiple=64,
|
742 |
+
n_layer=80,
|
743 |
+
n_head=64,
|
744 |
+
n_embd=8192,
|
745 |
+
n_query_groups=8,
|
746 |
+
rotary_percentage=1.0,
|
747 |
+
parallel_residual=False,
|
748 |
+
bias=False,
|
749 |
+
_norm_class="RMSNorm",
|
750 |
+
_mlp_class="LLaMAMLP",
|
751 |
+
intermediate_size=28672,
|
752 |
+
)
|
753 |
+
]
|
754 |
+
configs.extend(freewilly_2)
|
755 |
+
|
756 |
+
|
757 |
+
##################
|
758 |
+
# Meta Code Llama
|
759 |
+
##################
|
760 |
+
code_llama = [
|
761 |
+
# https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
|
762 |
+
dict(
|
763 |
+
name="CodeLlama-7b-hf",
|
764 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
|
765 |
+
block_size=16384,
|
766 |
+
vocab_size=32016,
|
767 |
+
padding_multiple=16,
|
768 |
+
n_layer=32,
|
769 |
+
rotary_percentage=1.0,
|
770 |
+
parallel_residual=False,
|
771 |
+
bias=False,
|
772 |
+
_norm_class="RMSNorm",
|
773 |
+
norm_eps=1e-05,
|
774 |
+
_mlp_class="LLaMAMLP",
|
775 |
+
intermediate_size=11008,
|
776 |
+
rope_base=1000000,
|
777 |
+
),
|
778 |
+
# https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
|
779 |
+
dict(
|
780 |
+
name="CodeLlama-13b-hf",
|
781 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
|
782 |
+
block_size=16384,
|
783 |
+
vocab_size=32016,
|
784 |
+
padding_multiple=16,
|
785 |
+
n_layer=40,
|
786 |
+
n_head=40,
|
787 |
+
n_embd=5120,
|
788 |
+
rotary_percentage=1.0,
|
789 |
+
parallel_residual=False,
|
790 |
+
bias=False,
|
791 |
+
_norm_class="RMSNorm",
|
792 |
+
norm_eps=1e-05,
|
793 |
+
_mlp_class="LLaMAMLP",
|
794 |
+
intermediate_size=13824,
|
795 |
+
rope_base=1000000,
|
796 |
+
),
|
797 |
+
# https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
|
798 |
+
dict(
|
799 |
+
name="CodeLlama-34b-hf",
|
800 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
|
801 |
+
block_size=16384,
|
802 |
+
vocab_size=32000,
|
803 |
+
padding_multiple=64,
|
804 |
+
n_layer=48,
|
805 |
+
n_head=64,
|
806 |
+
n_embd=8192,
|
807 |
+
n_query_groups=8,
|
808 |
+
rotary_percentage=1.0,
|
809 |
+
parallel_residual=False,
|
810 |
+
bias=False,
|
811 |
+
_norm_class="RMSNorm",
|
812 |
+
norm_eps=1e-05,
|
813 |
+
_mlp_class="LLaMAMLP",
|
814 |
+
intermediate_size=22016,
|
815 |
+
rope_base=1000000,
|
816 |
+
),
|
817 |
+
# https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
|
818 |
+
dict(
|
819 |
+
name="CodeLlama-7b-Python-hf",
|
820 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
|
821 |
+
block_size=16384,
|
822 |
+
vocab_size=32000,
|
823 |
+
padding_multiple=64,
|
824 |
+
n_layer=32,
|
825 |
+
rotary_percentage=1.0,
|
826 |
+
parallel_residual=False,
|
827 |
+
bias=False,
|
828 |
+
_norm_class="RMSNorm",
|
829 |
+
norm_eps=1e-05,
|
830 |
+
_mlp_class="LLaMAMLP",
|
831 |
+
intermediate_size=11008,
|
832 |
+
rope_base=1000000,
|
833 |
+
),
|
834 |
+
# https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
|
835 |
+
dict(
|
836 |
+
name="CodeLlama-13b-Python-hf",
|
837 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
|
838 |
+
block_size=16384,
|
839 |
+
vocab_size=32000,
|
840 |
+
padding_multiple=64,
|
841 |
+
n_layer=40,
|
842 |
+
n_head=40,
|
843 |
+
n_embd=5120,
|
844 |
+
rotary_percentage=1.0,
|
845 |
+
parallel_residual=False,
|
846 |
+
bias=False,
|
847 |
+
_norm_class="RMSNorm",
|
848 |
+
norm_eps=1e-05,
|
849 |
+
_mlp_class="LLaMAMLP",
|
850 |
+
intermediate_size=13824,
|
851 |
+
rope_base=1000000,
|
852 |
+
),
|
853 |
+
# https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
|
854 |
+
dict(
|
855 |
+
name="CodeLlama-34b-Python-hf",
|
856 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
|
857 |
+
block_size=16384,
|
858 |
+
vocab_size=32000,
|
859 |
+
padding_multiple=64,
|
860 |
+
n_layer=48,
|
861 |
+
n_head=64,
|
862 |
+
n_embd=8192,
|
863 |
+
n_query_groups=8,
|
864 |
+
rotary_percentage=1.0,
|
865 |
+
parallel_residual=False,
|
866 |
+
bias=False,
|
867 |
+
_norm_class="RMSNorm",
|
868 |
+
norm_eps=1e-05,
|
869 |
+
_mlp_class="LLaMAMLP",
|
870 |
+
intermediate_size=22016,
|
871 |
+
rope_base=1000000,
|
872 |
+
),
|
873 |
+
# https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json
|
874 |
+
dict(
|
875 |
+
name="CodeLlama-7b-Instruct-hf",
|
876 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
|
877 |
+
block_size=16384,
|
878 |
+
vocab_size=32016,
|
879 |
+
padding_multiple=16,
|
880 |
+
n_layer=32,
|
881 |
+
rotary_percentage=1.0,
|
882 |
+
parallel_residual=False,
|
883 |
+
bias=False,
|
884 |
+
_norm_class="RMSNorm",
|
885 |
+
norm_eps=1e-05,
|
886 |
+
_mlp_class="LLaMAMLP",
|
887 |
+
intermediate_size=11008,
|
888 |
+
rope_base=1000000,
|
889 |
+
),
|
890 |
+
# https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
|
891 |
+
dict(
|
892 |
+
name="CodeLlama-13b-Instruct-hf",
|
893 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
|
894 |
+
block_size=2048,
|
895 |
+
vocab_size=32016,
|
896 |
+
padding_multiple=16,
|
897 |
+
n_layer=40,
|
898 |
+
n_head=40,
|
899 |
+
n_embd=5120,
|
900 |
+
rotary_percentage=1.0,
|
901 |
+
parallel_residual=False,
|
902 |
+
bias=False,
|
903 |
+
_norm_class="RMSNorm",
|
904 |
+
norm_eps=1e-05,
|
905 |
+
_mlp_class="LLaMAMLP",
|
906 |
+
intermediate_size=13824,
|
907 |
+
rope_base=1000000,
|
908 |
+
),
|
909 |
+
# https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
|
910 |
+
dict(
|
911 |
+
name="CodeLlama-34b-Instruct-hf",
|
912 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
|
913 |
+
block_size=16384,
|
914 |
+
vocab_size=32000,
|
915 |
+
padding_multiple=64,
|
916 |
+
n_layer=48,
|
917 |
+
n_head=64,
|
918 |
+
n_embd=8192,
|
919 |
+
n_query_groups=8,
|
920 |
+
rotary_percentage=1.0,
|
921 |
+
parallel_residual=False,
|
922 |
+
bias=False,
|
923 |
+
_norm_class="RMSNorm",
|
924 |
+
norm_eps=1e-05,
|
925 |
+
_mlp_class="LLaMAMLP",
|
926 |
+
intermediate_size=22016,
|
927 |
+
rope_base=1000000,
|
928 |
+
),
|
929 |
+
]
|
930 |
+
configs.extend(code_llama)
|
931 |
+
|
932 |
+
|
933 |
+
########################
|
934 |
+
# garage-bAInd Platypus
|
935 |
+
########################
|
936 |
+
platypus = [
|
937 |
+
# https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
|
938 |
+
dict(
|
939 |
+
name="Platypus-30B",
|
940 |
+
hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
|
941 |
+
block_size=2048,
|
942 |
+
padded_vocab_size=32000,
|
943 |
+
n_layer=60,
|
944 |
+
n_head=52,
|
945 |
+
n_embd=6656,
|
946 |
+
rotary_percentage=1.0,
|
947 |
+
parallel_residual=False,
|
948 |
+
bias=False,
|
949 |
+
_norm_class="RMSNorm",
|
950 |
+
norm_eps=1e-06,
|
951 |
+
_mlp_class="LLaMAMLP",
|
952 |
+
intermediate_size=17920,
|
953 |
+
),
|
954 |
+
# https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
|
955 |
+
dict(
|
956 |
+
name="Platypus2-7B",
|
957 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
|
958 |
+
padded_vocab_size=32000,
|
959 |
+
n_layer=32,
|
960 |
+
rotary_percentage=1.0,
|
961 |
+
parallel_residual=False,
|
962 |
+
bias=False,
|
963 |
+
_norm_class="RMSNorm",
|
964 |
+
norm_eps=1e-05,
|
965 |
+
_mlp_class="LLaMAMLP",
|
966 |
+
intermediate_size=11008,
|
967 |
+
),
|
968 |
+
# https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
|
969 |
+
dict(
|
970 |
+
name="Platypus2-13B",
|
971 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
|
972 |
+
padded_vocab_size=32000,
|
973 |
+
n_layer=40,
|
974 |
+
n_head=40,
|
975 |
+
n_embd=5120,
|
976 |
+
rotary_percentage=1.0,
|
977 |
+
parallel_residual=False,
|
978 |
+
bias=False,
|
979 |
+
_norm_class="RMSNorm",
|
980 |
+
norm_eps=1e-05,
|
981 |
+
_mlp_class="LLaMAMLP",
|
982 |
+
intermediate_size=13824,
|
983 |
+
),
|
984 |
+
# https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
|
985 |
+
dict(
|
986 |
+
name="Platypus2-70B",
|
987 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
|
988 |
+
padded_vocab_size=32000,
|
989 |
+
n_layer=80,
|
990 |
+
n_head=64,
|
991 |
+
n_embd=8192,
|
992 |
+
rotary_percentage=1.0,
|
993 |
+
parallel_residual=False,
|
994 |
+
bias=False,
|
995 |
+
_norm_class="RMSNorm",
|
996 |
+
_mlp_class="LLaMAMLP",
|
997 |
+
intermediate_size=28672,
|
998 |
+
),
|
999 |
+
# https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
|
1000 |
+
dict(
|
1001 |
+
name="Camel-Platypus2-13B",
|
1002 |
+
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
|
1003 |
+
padded_vocab_size=32000,
|
1004 |
+
n_layer=40,
|
1005 |
+
n_head=40,
|
1006 |
+
n_embd=5120,
|
1007 |
+
rotary_percentage=1.0,
|
1008 |
+
parallel_residual=False,
|
1009 |
+
bias=False,
|
1010 |
+
_norm_class="RMSNorm",
|
1011 |
+
_mlp_class="LLaMAMLP",
|
1012 |
+
intermediate_size=13824,
|
1013 |
+
),
|
1014 |
+
# https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
|
1015 |
+
dict(
|
1016 |
+
name="Camel-Platypus2-70B",
|
1017 |
+
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
|
1018 |
+
padded_vocab_size=32000,
|
1019 |
+
n_layer=80,
|
1020 |
+
n_head=64,
|
1021 |
+
n_embd=8192,
|
1022 |
+
n_query_groups=8,
|
1023 |
+
rotary_percentage=1.0,
|
1024 |
+
parallel_residual=False,
|
1025 |
+
bias=False,
|
1026 |
+
_norm_class="RMSNorm",
|
1027 |
+
_mlp_class="LLaMAMLP",
|
1028 |
+
intermediate_size=28672,
|
1029 |
+
),
|
1030 |
+
# https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
|
1031 |
+
dict(
|
1032 |
+
name="Stable-Platypus2-13B",
|
1033 |
+
hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
|
1034 |
+
padded_vocab_size=32000,
|
1035 |
+
n_layer=40,
|
1036 |
+
n_head=40,
|
1037 |
+
n_embd=5120,
|
1038 |
+
rotary_percentage=1.0,
|
1039 |
+
parallel_residual=False,
|
1040 |
+
bias=False,
|
1041 |
+
_norm_class="RMSNorm",
|
1042 |
+
_mlp_class="LLaMAMLP",
|
1043 |
+
intermediate_size=13824,
|
1044 |
+
),
|
1045 |
+
# https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
|
1046 |
+
dict(
|
1047 |
+
name="Platypus2-70B-instruct",
|
1048 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
|
1049 |
+
padded_vocab_size=32000,
|
1050 |
+
n_layer=80,
|
1051 |
+
n_head=64,
|
1052 |
+
n_embd=8192,
|
1053 |
+
n_query_groups=8,
|
1054 |
+
rotary_percentage=1.0,
|
1055 |
+
parallel_residual=False,
|
1056 |
+
bias=False,
|
1057 |
+
_norm_class="RMSNorm",
|
1058 |
+
_mlp_class="LLaMAMLP",
|
1059 |
+
intermediate_size=28672,
|
1060 |
+
),
|
1061 |
+
]
|
1062 |
+
configs.extend(platypus)
|
1063 |
+
|
1064 |
+
|
1065 |
+
##########################
|
1066 |
+
# Stability AI StableCode
|
1067 |
+
##########################
|
1068 |
+
stablecode = [
|
1069 |
+
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
|
1070 |
+
dict(
|
1071 |
+
name="stablecode-completion-alpha-3b",
|
1072 |
+
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
|
1073 |
+
block_size=16384,
|
1074 |
+
vocab_size=49152,
|
1075 |
+
n_layer=32,
|
1076 |
+
n_embd=2560,
|
1077 |
+
),
|
1078 |
+
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
|
1079 |
+
dict(
|
1080 |
+
name="stablecode-completion-alpha-3b-4k",
|
1081 |
+
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
|
1082 |
+
vocab_size=49152,
|
1083 |
+
n_layer=32,
|
1084 |
+
n_embd=2560,
|
1085 |
+
),
|
1086 |
+
# https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
|
1087 |
+
dict(
|
1088 |
+
name="stablecode-instruct-alpha-3b",
|
1089 |
+
hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
|
1090 |
+
vocab_size=49152,
|
1091 |
+
n_layer=32,
|
1092 |
+
n_embd=2560,
|
1093 |
+
),
|
1094 |
+
]
|
1095 |
+
configs.extend(stablecode)
|
1096 |
+
|
1097 |
+
|
1098 |
+
##################################
|
1099 |
+
# togethercomputer LLaMA-2-7B-32K
|
1100 |
+
##################################
|
1101 |
+
together_llama2_32k = [
|
1102 |
+
# https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
|
1103 |
+
dict(
|
1104 |
+
name="LLaMA-2-7B-32K",
|
1105 |
+
hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
|
1106 |
+
vocab_size=32000,
|
1107 |
+
padding_multiple=64,
|
1108 |
+
n_layer=32,
|
1109 |
+
rotary_percentage=1.0,
|
1110 |
+
parallel_residual=False,
|
1111 |
+
bias=False,
|
1112 |
+
_norm_class="RMSNorm",
|
1113 |
+
_mlp_class="LLaMAMLP",
|
1114 |
+
intermediate_size=11008,
|
1115 |
+
rope_condense_ratio=8,
|
1116 |
+
)
|
1117 |
+
]
|
1118 |
+
configs.extend(together_llama2_32k)
|
1119 |
+
|
1120 |
+
|
1121 |
+
################
|
1122 |
+
# Microsoft Phi
|
1123 |
+
################
|
1124 |
+
phi = [
|
1125 |
+
# https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
|
1126 |
+
dict(
|
1127 |
+
name="phi-1_5",
|
1128 |
+
hf_config=dict(org="microsoft", name="phi-1_5"),
|
1129 |
+
vocab_size=50257,
|
1130 |
+
padded_vocab_size=51200,
|
1131 |
+
block_size=2048,
|
1132 |
+
n_embd=2048,
|
1133 |
+
n_layer=24,
|
1134 |
+
rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64
|
1135 |
+
shared_attention_norm=True,
|
1136 |
+
lm_head_bias=True,
|
1137 |
+
gelu_approximate="tanh",
|
1138 |
+
)
|
1139 |
+
]
|
1140 |
+
configs.extend(phi)
|
1141 |
+
|
1142 |
+
|
1143 |
+
#############
|
1144 |
+
# Mistral AI
|
1145 |
+
#############
|
1146 |
+
mistral = [
|
1147 |
+
# https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
|
1148 |
+
dict(
|
1149 |
+
name="Mistral-7B-{}v0.1",
|
1150 |
+
hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
|
1151 |
+
padded_vocab_size=32000,
|
1152 |
+
block_size=4096, # should be 32768 but sliding window attention is not implemented
|
1153 |
+
n_layer=32,
|
1154 |
+
n_query_groups=8,
|
1155 |
+
rotary_percentage=1.0,
|
1156 |
+
parallel_residual=False,
|
1157 |
+
bias=False,
|
1158 |
+
_norm_class="RMSNorm",
|
1159 |
+
norm_eps=1e-05,
|
1160 |
+
_mlp_class="LLaMAMLP",
|
1161 |
+
intermediate_size=14336,
|
1162 |
+
)
|
1163 |
+
]
|
1164 |
+
for c in mistral:
|
1165 |
+
for kind in ("", "Instruct-"):
|
1166 |
+
copy = deepcopy(c)
|
1167 |
+
copy["name"] = c["name"].format(kind)
|
1168 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
1169 |
+
configs.append(copy)
|
1170 |
+
|
1171 |
+
|
1172 |
+
############
|
1173 |
+
# TinyLlama
|
1174 |
+
############
|
1175 |
+
tiny_llama = [
|
1176 |
+
dict(
|
1177 |
+
name="tiny-llama-1.1b{}",
|
1178 |
+
hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
|
1179 |
+
block_size=2048,
|
1180 |
+
vocab_size=32000,
|
1181 |
+
padding_multiple=64,
|
1182 |
+
n_layer=22,
|
1183 |
+
n_head=32,
|
1184 |
+
n_embd=2048,
|
1185 |
+
rotary_percentage=1.0,
|
1186 |
+
parallel_residual=False,
|
1187 |
+
bias=False,
|
1188 |
+
_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
|
1189 |
+
norm_eps=1e-5,
|
1190 |
+
_mlp_class="LLaMAMLP",
|
1191 |
+
intermediate_size=5632,
|
1192 |
+
n_query_groups=4,
|
1193 |
+
),
|
1194 |
+
]
|
1195 |
+
for c in tiny_llama:
|
1196 |
+
for kind, hf_postfix in (("", "-intermediate-step-955k-token-2T"), ("chat", "-Chat-v0.6")):
|
1197 |
+
copy = deepcopy(c)
|
1198 |
+
copy["name"] = c["name"].format(kind)
|
1199 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix)
|
1200 |
+
configs.append(copy)
|
1201 |
+
|
1202 |
+
|
1203 |
+
name_to_config = {config["name"]: config for config in configs}
|
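End of `lit_gpt/config.py`. A hedged usage sketch of the registry built above, assuming the `lit_gpt` package from this commit is importable; keyword arguments override the registered defaults:
from lit_gpt.config import Config

config = Config.from_name("pythia-160m", block_size=1024)  # kwargs override registry defaults
print(config.n_layer, config.n_head, config.head_size)     # 12 12 64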
lit_gpt/lora.py
ADDED
@@ -0,0 +1,659 @@
1 |
+
# Derived from https://github.com/microsoft/LoRA
|
2 |
+
# ------------------------------------------------------------------------------------------
|
3 |
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
4 |
+
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
5 |
+
# ------------------------------------------------------------------------------------------
|
6 |
+
|
7 |
+
r"""
|
8 |
+
Low-Rank Adaptation (LoRA) scheme for LLMs.
|
9 |
+
|
10 |
+
┌───────────────────┐
|
11 |
+
┆ h ┆
|
12 |
+
└───────────────────┘
|
13 |
+
▲
|
14 |
+
|
|
15 |
+
+
|
16 |
+
/ \
|
17 |
+
┌─────────────────┐ ╭───────────────╮ Matrix initialization:
|
18 |
+
┆ ┆ \ B / B = 0
|
19 |
+
┆ pretrained ┆ \ r*d / A = N(0, sigma^2)
|
20 |
+
┆ weights ┆ ╰─────────╯
|
21 |
+
┆ ┆ | r | r - rank
|
22 |
+
┆ W e R^(d*d) ┆ | ◀─────▶ |
|
23 |
+
┆ ┆ ╭─────────╮
|
24 |
+
└─────────────────┘ / A \
|
25 |
+
▲ / d*r \
|
26 |
+
\ ╰───────────────╯
|
27 |
+
\ ▲
|
28 |
+
\ /
|
29 |
+
\ /
|
30 |
+
┌───────────────────┐
|
31 |
+
┆ x ┆
|
32 |
+
└───────────────────┘
|
33 |
+
|
34 |
+
With LoRA (Low-Rank Adaptation: https://arxiv.org/abs/2106.09685), instead of learning weights of size d*d,
|
35 |
+
we can freeze the pretrained weights and instead learn two matrices of size d*r and r*d (they will store weight updates
|
36 |
+
for the pretrained weights): the number of parameters in this case will be reduced drastically (depending on the rank of
|
37 |
+
course) yet after multiplication of matrices d*r and r*d we will get a matrix d*d which we can sum with frozen
|
38 |
+
pretrained weights and thus fine-tune the model.
|
39 |
+
|
40 |
+
The goal of this approach is to move weight updates into a separate matrix which is decomposed with
|
41 |
+
two matrices of a lower rank.
|
42 |
+
"""
|
43 |
+
|
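A hedged numeric illustration of the savings described in the docstring above: the d*d weight update is replaced by two matrices of sizes d*r and r*d.
d, r = 4096, 8
full_update = d * d          # 16_777_216 values would be trainable without LoRA
lora_update = d * r + r * d  # 65_536 trainable values with a rank-8 LoRA branch
print(full_update // lora_update)  # 256x fewer trainable parameters for this layer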
44 |
+
import math
|
45 |
+
from dataclasses import dataclass
|
46 |
+
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
47 |
+
|
48 |
+
import torch
|
49 |
+
import torch.nn as nn
|
50 |
+
from torch.nn import functional as F
|
51 |
+
from typing_extensions import Self
|
52 |
+
|
53 |
+
import lit_gpt
|
54 |
+
from lit_gpt.config import Config as BaseConfig
|
55 |
+
from lit_gpt.model import GPT as BaseModel
|
56 |
+
from lit_gpt.model import Block as BaseBlock
|
57 |
+
from lit_gpt.model import CausalSelfAttention as BaseCausalSelfAttention
|
58 |
+
from lit_gpt.model import KVCache
|
59 |
+
from lit_gpt.utils import map_old_state_dict_weights
|
60 |
+
|
61 |
+
|
62 |
+
class LoRALayer(nn.Module):
|
63 |
+
def __init__(self, r: int, lora_alpha: int, lora_dropout: float):
|
64 |
+
"""Store LoRA specific attributes in a class.
|
65 |
+
|
66 |
+
Args:
|
67 |
+
r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
|
68 |
+
the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
|
69 |
+
lora_alpha: alpha is needed for scaling updates as alpha/r
|
70 |
+
"This scaling helps to reduce the need to retune hyperparameters when we vary r"
|
71 |
+
https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
|
72 |
+
lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
|
73 |
+
"""
|
74 |
+
super().__init__()
|
75 |
+
assert r >= 0
|
76 |
+
self.r = r
|
77 |
+
self.lora_alpha = lora_alpha
|
78 |
+
# Optional dropout
|
79 |
+
if lora_dropout > 0.0:
|
80 |
+
self.lora_dropout = nn.Dropout(p=lora_dropout)
|
81 |
+
else:
|
82 |
+
self.lora_dropout = lambda x: x
|
83 |
+
# Mark the weight as unmerged
|
84 |
+
self.merged = False
|
85 |
+
|
86 |
+
|
87 |
+
class LoRALinear(LoRALayer):
|
88 |
+
# LoRA implemented in a dense layer
|
89 |
+
def __init__(
|
90 |
+
self,
|
91 |
+
# ↓ this part is for pretrained weights
|
92 |
+
in_features: int,
|
93 |
+
out_features: int,
|
94 |
+
# ↓ the remaining part is for LoRA
|
95 |
+
r: int = 0,
|
96 |
+
lora_alpha: int = 1,
|
97 |
+
lora_dropout: float = 0.0,
|
98 |
+
**kwargs,
|
99 |
+
):
|
100 |
+
"""LoRA wrapper around linear class.
|
101 |
+
|
102 |
+
This class has three weight matrices:
|
103 |
+
1. Pretrained weights are stored as `self.linear.weight`
|
104 |
+
2. LoRA A matrix as `self.lora_A`
|
105 |
+
3. LoRA B matrix as `self.lora_B`
|
106 |
+
Only LoRA's A and B matrices are updated, pretrained weights stay frozen.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
in_features: number of input features of the pretrained weights
|
110 |
+
out_features: number of output features of the pretrained weights
|
111 |
+
r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
|
112 |
+
the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
|
113 |
+
lora_alpha: alpha is needed for scaling updates as alpha/r
|
114 |
+
"This scaling helps to reduce the need to retune hyperparameters when we vary r"
|
115 |
+
https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
|
116 |
+
lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
|
117 |
+
"""
|
118 |
+
super().__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout)
|
119 |
+
self.linear = torch.nn.Linear(in_features, out_features, **kwargs)
|
120 |
+
|
121 |
+
# Actual trainable parameters
|
122 |
+
if r > 0:
|
123 |
+
self.lora_A = nn.Parameter(torch.zeros((r, in_features)))
|
124 |
+
self.lora_B = nn.Parameter(torch.zeros((out_features, r)))
|
125 |
+
self.scaling = self.lora_alpha / self.r
|
126 |
+
self.reset_parameters()
|
127 |
+
|
128 |
+
def reset_parameters(self) -> None:
|
129 |
+
"""Reset all the weights, even including pretrained ones."""
|
130 |
+
if hasattr(self, "lora_A"):
|
131 |
+
# initialize A the same way as the default for nn.Linear and B to zero
|
132 |
+
# Wondering why 'a' is equal to math.sqrt(5)?: https://github.com/pytorch/pytorch/issues/15314
|
133 |
+
nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
|
134 |
+
nn.init.zeros_(self.lora_B)
|
135 |
+
|
136 |
+
def merge(self) -> None:
|
137 |
+
"""Merges the LoRA weights into the full-rank weights (W = W + delta_W)."""
|
138 |
+
if self.r > 0 and not self.merged:
|
139 |
+
# Merge the weights and mark it
|
140 |
+
self.linear.weight.data += (self.lora_B @ self.lora_A) * self.scaling
|
141 |
+
self.merged = True
|
142 |
+
|
143 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
144 |
+
# if the weights are merged or the rank is less than or equal to zero (LoRA is disabled), it's only a regular nn.Linear forward pass;
|
145 |
+
# otherwise, additionally do the forward pass with the LoRA weights and add its output to the output from the pretrained weights
|
146 |
+
pretrained = self.linear(x)
|
147 |
+
if self.r == 0 or self.merged:
|
148 |
+
return pretrained
|
149 |
+
lora = (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
|
150 |
+
return pretrained + lora
|
151 |
+
|
152 |
+
|
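A hedged usage sketch of `LoRALinear` (assuming `lit_gpt.lora` from this commit is importable); merging folds the low-rank update back into the dense weight, so inference afterwards is a plain `nn.Linear` forward pass:
import torch
from lit_gpt.lora import LoRALinear  # assumes this module is on the path

layer = LoRALinear(128, 256, r=8, lora_alpha=16, lora_dropout=0.05)
layer.linear.weight.requires_grad = False   # freeze the pretrained weight
y = layer(torch.randn(4, 128))              # pretrained output + scaled LoRA branch
layer.merge()                               # W <- W + (B @ A) * alpha / r
y_merged = layer(torch.randn(4, 128))       # now a single dense forward pass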
153 |
+
class LoRAQKVLinear(LoRALinear):
|
154 |
+
# LoRA implemented in a dense layer
|
155 |
+
def __init__(
|
156 |
+
self,
|
157 |
+
# ↓ this part is for pretrained weights
|
158 |
+
in_features: int,
|
159 |
+
out_features: int,
|
160 |
+
# ↓ the remaining part is for LoRA
|
161 |
+
n_head: int,
|
162 |
+
n_query_groups: int,
|
163 |
+
r: int = 0,
|
164 |
+
lora_alpha: int = 1,
|
165 |
+
lora_dropout: float = 0.0,
|
166 |
+
enable_lora: Union[bool, Tuple[bool, bool, bool]] = False,
|
167 |
+
**kwargs,
|
168 |
+
):
|
169 |
+
"""LoRA wrapper around linear class that is used for calculation of q, k and v matrices.
|
170 |
+
|
171 |
+
This class has three weight matrices:
|
172 |
+
1. Pretrained weights are stored as `self.linear.weight`
|
173 |
+
2. LoRA A matrix as `self.lora_A`
|
174 |
+
3. LoRA B matrix as `self.lora_B`
|
175 |
+
Only LoRA's A and B matrices are updated, pretrained weights stay frozen.
|
176 |
+
|
177 |
+
Args:
|
178 |
+
in_features: number of input features of the pretrained weights
|
179 |
+
out_features: number of output features of the pretrained weights
|
180 |
+
n_head: number of attention heads
|
181 |
+
n_query_groups: number of query groups (see diagram in `lit_gpt/config.py`)
|
182 |
+
r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
|
183 |
+
the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
|
184 |
+
lora_alpha: alpha is needed for scaling updates as alpha/r
|
185 |
+
"This scaling helps to reduce the need to retune hyperparameters when we vary r"
|
186 |
+
https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
|
187 |
+
lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
|
188 |
+
enable_lora: this class handles the single combined QKV weight matrix of the attention block. If we
|
189 |
+
don't want to apply LoRA we can set it as False. For example if we want to apply LoRA only to `query`
|
190 |
+
and `value` but keep `key` without weight updates we should pass `[True, False, True]`
|
191 |
+
"""
|
192 |
+
super(LoRALinear, self).__init__(r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout)
|
193 |
+
self.linear = torch.nn.Linear(in_features, out_features, **kwargs)
|
194 |
+
self.n_head = n_head
|
195 |
+
self.n_query_groups = n_query_groups
|
196 |
+
if isinstance(enable_lora, bool):
|
197 |
+
enable_lora = [enable_lora] * 3
|
198 |
+
assert len(enable_lora) == 3
|
199 |
+
self.enable_lora = enable_lora
|
200 |
+
|
201 |
+
# Actual trainable parameters
|
202 |
+
# To better understand initialization let's imagine that we have such parameters:
|
203 |
+
# ⚬ in_features: 128 (embeddings_size)
|
204 |
+
# ⚬ out_features: 384 (3 * embedding_size)
|
205 |
+
# ⚬ r: 2
|
206 |
+
# ⚬ enable_lora: [True, False, True]
|
207 |
+
if r > 0 and any(enable_lora):
|
208 |
+
self.lora_A = nn.Parameter(torch.zeros((r * sum(enable_lora), in_features))) # (4, 128)
|
209 |
+
enable_q, enable_k, enable_v = enable_lora
|
210 |
+
self.kv_embd_size = self.linear.in_features // (n_head // n_query_groups)
|
211 |
+
# qkv_shapes will be used to split a tensor with weights correctly
|
212 |
+
qkv_shapes = (
|
213 |
+
self.linear.in_features * enable_q,
|
214 |
+
self.kv_embd_size * enable_k,
|
215 |
+
self.kv_embd_size * enable_v,
|
216 |
+
)
|
217 |
+
self.qkv_shapes = [s for s in qkv_shapes if s]
|
218 |
+
self.lora_B = nn.Parameter(torch.zeros(sum(self.qkv_shapes), r)) # (256, 2))
|
219 |
+
# Notes about shapes above
|
220 |
+
# - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices;
|
221 |
+
# 128 is the input size of the x (embedding size). (4, 128) and not (128, 4) because later on in
|
222 |
+
# F.linear function weights are automatically transposed. In addition conv1d requires channels to
|
223 |
+
# be before seq length
|
224 |
+
# - self.lora_B has shape (256, 2): 256 because LoRA is applied only to two matrices, so the output is
|
225 |
+
# 128*2; 2 tells to have two channels per group for group convolution
|
226 |
+
|
227 |
+
# Scaling:
|
228 |
+
# This balances the pretrained model`s knowledge and the new task-specific adaptation
|
229 |
+
# https://lightning.ai/pages/community/tutorial/lora-llm/
|
230 |
+
# So, set alpha to 1.0 to fully add LoRA. If the LoRA seems to have too much effect (i.e., overfitted), set
|
231 |
+
# alpha to lower value. If the LoRA seems to have too little effect, set alpha to higher than 1.0. You can
|
232 |
+
# tune these values to your needs. This value can be even slightly greater than 1.0!
|
233 |
+
# https://github.com/cloneofsimo/lora
|
234 |
+
self.scaling = self.lora_alpha / self.r
|
235 |
+
|
236 |
+
# Compute the indices
|
237 |
+
# Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values,
|
238 |
+
# but not keys, then the weights update should be:
|
239 |
+
#
|
240 |
+
# [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,],
|
241 |
+
# [....................................],
|
242 |
+
# [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]]
|
243 |
+
# ↑ ↑ ↑
|
244 |
+
# ________________________________________
|
245 |
+
# | query | key | value |
|
246 |
+
# ----------------------------------------
|
247 |
+
self.lora_ind = []
|
248 |
+
if enable_q:
|
249 |
+
self.lora_ind.extend(range(0, self.linear.in_features))
|
250 |
+
if enable_k:
|
251 |
+
self.lora_ind.extend(range(self.linear.in_features, self.linear.in_features + self.kv_embd_size))
|
252 |
+
if enable_v:
|
253 |
+
self.lora_ind.extend(range(self.linear.in_features + self.kv_embd_size, self.linear.out_features))
|
254 |
+
self.reset_parameters()
|
255 |
+
|
256 |
+
def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
|
257 |
+
"""Properly pad weight updates with zeros.
|
258 |
+
|
259 |
+
If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys,
|
260 |
+
then the weights update should be:
|
261 |
+
|
262 |
+
[[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,],
|
263 |
+
[....................................],
|
264 |
+
[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]]
|
265 |
+
↑ ↑ ↑
|
266 |
+
________________________________________
|
267 |
+
| query | key | value |
|
268 |
+
----------------------------------------
|
269 |
+
|
270 |
+
Args:
|
271 |
+
x: tensor with weights update that will be padded with zeros if necessary
|
272 |
+
|
273 |
+
Returns:
|
274 |
+
A tensor with weight updates and zeros for deselected q, k or v
|
275 |
+
"""
|
276 |
+
# we need to do zero padding only if LoRA is disabled for one of QKV matrices
|
277 |
+
if all(self.enable_lora):
|
278 |
+
return x
|
279 |
+
|
280 |
+
# Let's imagine that:
|
281 |
+
# ⚬ input x has shape (64, 64, 256): (batch_size, sequence_length, embeddings_size)
|
282 |
+
# ⚬ embeddings_size: 128
|
283 |
+
# ⚬ self.linear.out_features: 384 (3 * embeddings_size)
|
284 |
+
# ⚬ enable_lora: [True, False, True]
|
285 |
+
# Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected
|
286 |
+
# embeddings_size is 384 (self.linear.out_features), so that means that we need to pad from 256 to 384 with zeros, but
|
287 |
+
# only for key updates (this is where self.lora_ind comes in handy)
|
288 |
+
# Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors
|
289 |
+
# for example when we want to merge/unmerge LoRA weights and pretrained weights
|
290 |
+
x = x.transpose(0, 1)
|
291 |
+
result = x.new_zeros((*x.shape[:-1], self.linear.out_features)) # (64, 64, 384)
|
292 |
+
result = result.view(-1, self.linear.out_features) # (4096, 384)
|
293 |
+
result = result.index_copy(
|
294 |
+
1, torch.tensor(self.lora_ind, device=result.device), x.reshape(-1, sum(self.qkv_shapes))
|
295 |
+
) # (4096, 256)
|
296 |
+
return result.view((*x.shape[:-1], self.linear.out_features)).transpose(0, 1) # (64, 64, 384)
|
297 |
+
|
298 |
+
def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
|
299 |
+
"""An extension of the `torch.nn.functional.conv1d` function with a logic specific to grouped queries.
|
300 |
+
|
301 |
+
If the number of heads is equal to the number of query groups - grouped queries are disabled
|
302 |
+
(see scheme in `lit_gpt/config.py:Config`). In this case the combined QKV matrix consists of equally sized
|
303 |
+
query, key and value parts, which means we can utilize `groups` argument from `conv1d`: with this argument the
|
304 |
+
input and weight matrices will be split into equally sized parts and applied separately (like having multiple
|
305 |
+
conv layers side by side).
|
306 |
+
|
307 |
+
Otherwise QKV matrix consists of unequally sized parts and thus we have to split input and weight matrices manually,
|
308 |
+
apply each part of the weight matrix to the corresponding input's part and concatenate the result.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
input: input matrix of shape (B, C, T)
|
312 |
+
weight: weight matrix of shape (C_output, rank, 1).
|
313 |
+
"C_output" is defined as a sum of embedding sizes for each enabled LoRA layer (see init method of the class).
|
314 |
+
|
315 |
+
Returns:
|
316 |
+
A tensor with a shape (B, C_output, T)
|
317 |
+
|
318 |
+
"""
|
319 |
+
if self.n_head == self.n_query_groups:
|
320 |
+
return F.conv1d(input, weight, groups=sum(self.enable_lora)) # (B, C_output, T)
|
321 |
+
|
322 |
+
# Notation:
|
323 |
+
# ⚬ N: number of enabled LoRA layers (self.enable_lora)
|
324 |
+
# ⚬ C_output': embeddings size for each LoRA layer (not equal in size)
|
325 |
+
# ⚬ r: rank of all LoRA layers (equal in size)
|
326 |
+
|
327 |
+
input_splitted = input.chunk(sum(self.enable_lora), dim=1) # N * (B, C // N, T)
|
328 |
+
weight_splitted = weight.split(self.qkv_shapes) # N * (C_output', r, 1)
|
329 |
+
return torch.cat(
|
330 |
+
[F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1 # (B, C_output', T)
|
331 |
+
) # (B, C_output, T)
|
332 |
+
|
333 |
+
def merge(self) -> None:
|
334 |
+
"""Merges the LoRA weights into the full-rank weights (W = W + delta_W)."""
|
335 |
+
|
336 |
+
# Let's assume that:
|
337 |
+
# ⚬ self.linear.weight.data: (384, 128) or (3 * embedding_size, embedding_size)
|
338 |
+
# ⚬ self.lora_A.data: (4, 128)
|
339 |
+
# ⚬ self.lora_B.data: (256, 2)
|
340 |
+
if self.r > 0 and any(self.enable_lora) and not self.merged:
|
341 |
+
delta_w = self.conv1d(
|
342 |
+
self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128)
|
343 |
+
self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
|
344 |
+
).squeeze(
|
345 |
+
0
|
346 |
+
) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128)
|
347 |
+
# W = W + delta_W (merge)
|
348 |
+
self.linear.weight.data += self.zero_pad(delta_w * self.scaling) # (256, 128) after zero_pad (384, 128)
|
349 |
+
self.merged = True
|
350 |
+
|
351 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
352 |
+
"""Do the forward pass.
|
353 |
+
|
354 |
+
If LoRA's weights are merged with pretrained ones then it's a simple matrix multiplication.
|
355 |
+
If not, then multiply pretrained weights with input, apply LoRA on input and do summation.
|
356 |
+
|
357 |
+
Args:
|
358 |
+
x: input tensor of shape (batch_size, context_length, embedding_size)
|
359 |
+
|
360 |
+
Returns:
|
361 |
+
Output tensor of shape (batch_size, context_length, 3 * embedding_size)
|
362 |
+
"""
|
363 |
+
|
364 |
+
# Let's assume that:
|
365 |
+
# ⚬ x: (64, 64, 128) or (batch_size, context_length, embedding_size)
|
366 |
+
# ⚬ self.linear.weight: (384, 128) or (3 * embedding_size, embedding_size)
|
367 |
+
# ⚬ self.lora_A.data: (4, 128)
|
368 |
+
# ⚬ self.lora_B.data: (256, 2)
|
369 |
+
|
370 |
+
# if weights are merged or LoRA is disabled (r <= 0 or all `enable_lora` are False) - it's only a regular nn.Linear forward pass;
|
371 |
+
# otherwise in addition do the forward pass with LoRA weights and add it's output to the output from pretrained weights
|
372 |
+
pretrained = self.linear(x)
|
373 |
+
if self.r == 0 or not any(self.enable_lora) or self.merged:
|
374 |
+
return pretrained
|
375 |
+
after_A = F.linear(self.lora_dropout(x), self.lora_A) # (64, 64, 128) @ (4, 128) -> (64, 64, 4)
|
376 |
+
# For F.conv1d:
|
377 |
+
# ⚬ input: input tensor of shape (mini-batch, in_channels, iW)
|
378 |
+
# ⚬ weight: filters of shape (out_channels, in_channels/groups, kW)
|
379 |
+
after_B = self.conv1d(
|
380 |
+
after_A.transpose(-2, -1), # (64, 64, 4) -> (64, 4, 64)
|
381 |
+
self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
|
382 |
+
).transpose(
|
383 |
+
-2, -1
|
384 |
+
) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256)
|
385 |
+
lora = self.zero_pad(after_B) * self.scaling # (64, 64, 256) after zero_pad (64, 64, 384)
|
386 |
+
return pretrained + lora
|
387 |
+
|
388 |
+
|
389 |
+
def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None:
|
390 |
+
"""Freeze all modules except LoRA's and depending on 'bias' value unfreezes bias weights.
|
391 |
+
|
392 |
+
Args:
|
393 |
+
model: model with LoRA layers
|
394 |
+
bias:
|
395 |
+
``"none"``: all bias weights will be frozen,
|
396 |
+
``"lora_only"``: only bias weight for LoRA layers will be unfrozen,
|
397 |
+
``"all"``: all bias weights will be unfrozen.
|
398 |
+
|
399 |
+
Raises:
|
400 |
+
NotImplementedError: if `bias` not in ["none", "lora_only", "all"]
|
401 |
+
"""
|
402 |
+
# freeze all layers except LoRA's
|
403 |
+
for n, p in model.named_parameters():
|
404 |
+
if "lora_" not in n:
|
405 |
+
p.requires_grad = False
|
406 |
+
|
407 |
+
# depending on the `bias` value unfreeze bias weights
|
408 |
+
if bias == "none":
|
409 |
+
return
|
410 |
+
if bias == "all":
|
411 |
+
for n, p in model.named_parameters():
|
412 |
+
if "bias" in n:
|
413 |
+
p.requires_grad = True
|
414 |
+
elif bias == "lora_only":
|
415 |
+
for m in model.modules():
|
416 |
+
if isinstance(m, LoRALayer) and hasattr(m, "bias") and m.bias is not None:
|
417 |
+
m.bias.requires_grad = True
|
418 |
+
else:
|
419 |
+
raise NotImplementedError
|
420 |
+
|
421 |
+
|
422 |
+
def lora_filter(key: str, value: Any) -> bool:
|
423 |
+
return "lora_" in key
|
424 |
+
|
425 |
+
|
426 |
+
@dataclass
|
427 |
+
class Config(BaseConfig):
|
428 |
+
"""
|
429 |
+
Args:
|
430 |
+
r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
|
431 |
+
the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
|
432 |
+
alpha: alpha is needed for scaling updates as alpha/r
|
433 |
+
"This scaling helps to reduce the need to retune hyperparameters when we vary r"
|
434 |
+
https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
|
435 |
+
dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
|
436 |
+
to_*: either apply LoRA to the specified weights or not
|
437 |
+
"""
|
438 |
+
|
439 |
+
r: int = 0
|
440 |
+
alpha: int = 1
|
441 |
+
dropout: float = 0.0
|
442 |
+
to_query: bool = False
|
443 |
+
to_key: bool = False
|
444 |
+
to_value: bool = False
|
445 |
+
to_projection: bool = False
|
446 |
+
to_mlp: bool = False
|
447 |
+
to_head: bool = False
|
448 |
+
|
449 |
+
@property
|
450 |
+
def mlp_class(self) -> Type:
|
451 |
+
return getattr(lit_gpt.lora, self._mlp_class)
|
452 |
+
|
453 |
+
|
454 |
+
class GPT(BaseModel):
|
455 |
+
def __init__(self, config: Config) -> None:
|
456 |
+
nn.Module.__init__(self)
|
457 |
+
assert config.padded_vocab_size is not None
|
458 |
+
self.config = config
|
459 |
+
|
460 |
+
self.lm_head = LoRALinear(
|
461 |
+
config.n_embd,
|
462 |
+
config.padded_vocab_size,
|
463 |
+
bias=config.lm_head_bias,
|
464 |
+
r=(config.r if config.to_head else 0),
|
465 |
+
lora_alpha=config.alpha,
|
466 |
+
lora_dropout=config.dropout,
|
467 |
+
)
|
468 |
+
self.transformer = nn.ModuleDict(
|
469 |
+
dict(
|
470 |
+
wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
|
471 |
+
h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
|
472 |
+
ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
|
473 |
+
)
|
474 |
+
)
|
475 |
+
self.max_seq_length = self.config.block_size
|
476 |
+
self.mask_cache: Optional[torch.Tensor] = None
|
477 |
+
|
478 |
+
def forward(
|
479 |
+
self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None, lm_head_chunk_size: int = 0
|
480 |
+
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
481 |
+
T = idx.size(1)
|
482 |
+
if self.max_seq_length < T:
|
483 |
+
raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
|
484 |
+
|
485 |
+
if input_pos is not None: # use the kv cache
|
486 |
+
cos = self.cos.index_select(0, input_pos)
|
487 |
+
sin = self.sin.index_select(0, input_pos)
|
488 |
+
if self.mask_cache is None:
|
489 |
+
raise TypeError("You need to call `gpt.set_kv_cache()`")
|
490 |
+
mask = self.mask_cache.index_select(2, input_pos)
|
491 |
+
else:
|
492 |
+
cos = self.cos[:T]
|
493 |
+
sin = self.sin[:T]
|
494 |
+
mask = None
|
495 |
+
|
496 |
+
x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
497 |
+
for block in self.transformer.h:
|
498 |
+
x = block(x, cos, sin, mask, input_pos)
|
499 |
+
x = self.transformer.ln_f(x)
|
500 |
+
if lm_head_chunk_size > 0:
|
501 |
+
# chunk the lm head logits to reduce the peak memory used by autograd
|
502 |
+
return [self.lm_head(x_i) for x_i in x.split(lm_head_chunk_size, dim=1)]
|
503 |
+
return self.lm_head(x) # (B, T, vocab_size)
|
504 |
+
|
505 |
+
@classmethod
|
506 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
507 |
+
return cls(Config.from_name(name, **kwargs))
|
508 |
+
|
509 |
+
def _init_weights(self, module: nn.Module) -> None:
|
510 |
+
"""Meant to be used with `gpt.apply(gpt._init_weights)`. Unused method left for completeness."""
|
511 |
+
super()._init_weights(module)
|
512 |
+
if isinstance(module, LoRALinear):
|
513 |
+
module.reset_parameters()
|
514 |
+
|
515 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
516 |
+
"""For compatibility with base checkpoints."""
|
517 |
+
mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"}
|
518 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
519 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
520 |
+
|
521 |
+
|
522 |
+
class Block(BaseBlock):
|
523 |
+
def __init__(self, config: Config) -> None:
|
524 |
+
nn.Module.__init__(self)
|
525 |
+
self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
526 |
+
self.attn = CausalSelfAttention(config)
|
527 |
+
if not config.shared_attention_norm:
|
528 |
+
self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
529 |
+
self.mlp = config.mlp_class(config)
|
530 |
+
|
531 |
+
self.config = config
|
532 |
+
|
533 |
+
|
534 |
+
class CausalSelfAttention(BaseCausalSelfAttention):
|
535 |
+
def __init__(self, config: Config) -> None:
|
536 |
+
# Skip the parent class __init__ altogether and replace it to avoid
|
537 |
+
# useless allocations
|
538 |
+
nn.Module.__init__(self)
|
539 |
+
shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
|
540 |
+
# key, query, value projections for all heads, but in a batch
|
541 |
+
self.attn = LoRAQKVLinear(
|
542 |
+
in_features=config.n_embd,
|
543 |
+
out_features=shape,
|
544 |
+
r=config.r,
|
545 |
+
lora_alpha=config.alpha,
|
546 |
+
lora_dropout=config.dropout,
|
547 |
+
enable_lora=(config.to_query, config.to_key, config.to_value),
|
548 |
+
bias=config.bias,
|
549 |
+
# for MQA/GQA support
|
550 |
+
n_head=config.n_head,
|
551 |
+
n_query_groups=config.n_query_groups,
|
552 |
+
)
|
553 |
+
# output projection
|
554 |
+
self.proj = LoRALinear(
|
555 |
+
config.n_embd,
|
556 |
+
config.n_embd,
|
557 |
+
bias=config.bias,
|
558 |
+
r=(config.r if config.to_projection else 0),
|
559 |
+
lora_alpha=config.alpha,
|
560 |
+
lora_dropout=config.dropout,
|
561 |
+
)
|
562 |
+
# disabled by default
|
563 |
+
self.kv_cache: Optional[KVCache] = None
|
564 |
+
|
565 |
+
self.config = config
|
566 |
+
|
567 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
568 |
+
"""For compatibility with base checkpoints."""
|
569 |
+
mapping = {
|
570 |
+
"attn.weight": "attn.linear.weight",
|
571 |
+
"attn.bias": "attn.linear.bias",
|
572 |
+
"proj.weight": "proj.linear.weight",
|
573 |
+
"proj.bias": "proj.linear.bias",
|
574 |
+
}
|
575 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
576 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
577 |
+
|
578 |
+
|
579 |
+
class GptNeoxMLP(lit_gpt.model.GptNeoxMLP):
|
580 |
+
def __init__(self, config: Config) -> None:
|
581 |
+
nn.Module.__init__(self)
|
582 |
+
self.fc = LoRALinear(
|
583 |
+
config.n_embd,
|
584 |
+
config.intermediate_size,
|
585 |
+
bias=config.bias,
|
586 |
+
r=(config.r if config.to_mlp else 0),
|
587 |
+
lora_alpha=config.alpha,
|
588 |
+
lora_dropout=config.dropout,
|
589 |
+
)
|
590 |
+
self.proj = LoRALinear(
|
591 |
+
config.intermediate_size,
|
592 |
+
config.n_embd,
|
593 |
+
bias=config.bias,
|
594 |
+
r=(config.r if config.to_mlp else 0),
|
595 |
+
lora_alpha=config.alpha,
|
596 |
+
lora_dropout=config.dropout,
|
597 |
+
)
|
598 |
+
|
599 |
+
self.config = config
|
600 |
+
|
601 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
602 |
+
"""For compatibility with base checkpoints."""
|
603 |
+
mapping = {
|
604 |
+
"fc.weight": "fc.linear.weight",
|
605 |
+
"fc.bias": "fc.linear.bias",
|
606 |
+
"proj.weight": "proj.linear.weight",
|
607 |
+
"proj.bias": "proj.linear.bias",
|
608 |
+
}
|
609 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
610 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
611 |
+
|
612 |
+
|
613 |
+
class LLaMAMLP(lit_gpt.model.LLaMAMLP):
|
614 |
+
def __init__(self, config: Config) -> None:
|
615 |
+
nn.Module.__init__(self)
|
616 |
+
self.fc_1 = LoRALinear(
|
617 |
+
config.n_embd,
|
618 |
+
config.intermediate_size,
|
619 |
+
bias=config.bias,
|
620 |
+
r=(config.r if config.to_mlp else 0),
|
621 |
+
lora_alpha=config.alpha,
|
622 |
+
lora_dropout=config.dropout,
|
623 |
+
)
|
624 |
+
self.fc_2 = LoRALinear(
|
625 |
+
config.n_embd,
|
626 |
+
config.intermediate_size,
|
627 |
+
bias=config.bias,
|
628 |
+
r=(config.r if config.to_mlp else 0),
|
629 |
+
lora_alpha=config.alpha,
|
630 |
+
lora_dropout=config.dropout,
|
631 |
+
)
|
632 |
+
self.proj = LoRALinear(
|
633 |
+
config.intermediate_size,
|
634 |
+
config.n_embd,
|
635 |
+
bias=config.bias,
|
636 |
+
r=(config.r if config.to_mlp else 0),
|
637 |
+
lora_alpha=config.alpha,
|
638 |
+
lora_dropout=config.dropout,
|
639 |
+
)
|
640 |
+
|
641 |
+
def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
|
642 |
+
"""For compatibility with base checkpoints."""
|
643 |
+
mapping = {
|
644 |
+
"fc_1.weight": "fc_1.linear.weight",
|
645 |
+
"fc_1.bias": "fc_1.linear.bias",
|
646 |
+
"fc_2.weight": "fc_2.linear.weight",
|
647 |
+
"fc_2.bias": "fc_2.linear.bias",
|
648 |
+
"proj.weight": "proj.linear.weight",
|
649 |
+
"proj.bias": "proj.linear.bias",
|
650 |
+
}
|
651 |
+
state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
|
652 |
+
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
653 |
+
|
654 |
+
|
655 |
+
def merge_lora_weights(model: GPT) -> None:
|
656 |
+
"""Merge LoRA weights into the full-rank weights to speed up inference."""
|
657 |
+
for module in model.modules():
|
658 |
+
if isinstance(module, LoRALinear):
|
659 |
+
module.merge()
|
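For context, a minimal sketch of how the helpers in this file are typically wired together during fine-tuning and inference; the config name and LoRA hyperparameters below are illustrative assumptions, not values taken from this Space.

# Hypothetical usage sketch; config name and hyperparameters are assumptions.
import torch
from lit_gpt.lora import GPT, Config, lora_filter, mark_only_lora_as_trainable, merge_lora_weights

config = Config.from_name("pythia-160m", r=8, alpha=16, dropout=0.05, to_query=True, to_value=True)
model = GPT(config)
mark_only_lora_as_trainable(model)  # freezes everything except the lora_A / lora_B tensors

# ... fine-tune, then persist only the adapter weights ...
lora_state = {k: v for k, v in model.state_dict().items() if lora_filter(k, v)}
torch.save(lora_state, "lora_weights.pth")

merge_lora_weights(model)  # fold W + (alpha/r) * B @ A into the base weights before inference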
lit_gpt/model.py
ADDED
@@ -0,0 +1,345 @@
"""Full definition of a GPT NeoX Language Model, all of it in this single file.

Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
"""
import math
from typing import Any, Optional, Tuple

import torch
import torch.nn as nn
from typing_extensions import Self

from lit_gpt.config import Config


class GPT(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        assert config.padded_vocab_size is not None
        self.config = config

        self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
                h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
                ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
            )
        )
        self.max_seq_length = self.config.block_size
        self.mask_cache: Optional[torch.Tensor] = None

    @property
    def max_seq_length(self) -> int:
        return self._max_seq_length

    @max_seq_length.setter
    def max_seq_length(self, value: int) -> None:
        """
        When doing inference, the sequences used might be shorter than the model's context length.
        This allows setting a smaller number to avoid allocating unused memory
        """
        if value > self.config.block_size:
            raise ValueError(f"Cannot attend to {value}, block size is only {self.config.block_size}")
        self._max_seq_length = value
        if not hasattr(self, "cos"):
            # first call
            cos, sin = self.rope_cache()
            self.register_buffer("cos", cos, persistent=False)
            self.register_buffer("sin", sin, persistent=False)
        elif value != self.cos.size(0):
            # override
            self.cos, self.sin = self.rope_cache(device=self.cos.device)
        # the mask and kv cache size will get updated on `set_kv_cache`. we cannot update it here because we don't know
        # if the kv cache is expected

    def reset_parameters(self) -> None:
        # Trigger resetting the rope-cache
        self.max_seq_length = self.config.block_size

    def _init_weights(self, module: nn.Module) -> None:
        """Meant to be used with `gpt.apply(gpt._init_weights)`."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        T = idx.size(1)
        if self.max_seq_length < T:
            raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")

        if input_pos is not None:  # use the kv cache
            cos = self.cos.index_select(0, input_pos)
            sin = self.sin.index_select(0, input_pos)
            if self.mask_cache is None:
                raise TypeError("You need to call `gpt.set_kv_cache()`")
            mask = self.mask_cache.index_select(2, input_pos)
        else:
            cos = self.cos[:T]
            sin = self.sin[:T]
            mask = None

        x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        for block in self.transformer.h:
            x = block(x, cos, sin, mask, input_pos)
        x = self.transformer.ln_f(x)
        return self.lm_head(x)  # (b, t, vocab_size)

    @classmethod
    def from_name(cls, name: str, **kwargs: Any) -> Self:
        return cls(Config.from_name(name, **kwargs))

    def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        return build_rope_cache(
            seq_len=self.max_seq_length,
            n_elem=self.config.rope_n_elem,
            device=device,
            condense_ratio=self.config.rope_condense_ratio,
            base=self.config.rope_base,
        )

    def set_kv_cache(
        self,
        batch_size: int,
        rope_cache_length: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        if rope_cache_length is None:
            rope_cache_length = self.cos.size(-1)
        max_seq_length = self.max_seq_length

        # initialize the kv cache for all blocks
        for block in self.transformer.h:
            block.attn.kv_cache = block.attn.build_kv_cache(
                batch_size, max_seq_length, rope_cache_length, device, dtype
            )

        if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length:
            # passing `attn_mask` to SDPA downgrades it to use the inefficient implementation. since we only need the mask
            # for the kv-cache support (only during inference), we only create it in that situation
            # this will be resolved by https://github.com/pytorch/pytorch/issues/96099
            ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool)
            self.mask_cache = torch.tril(ones).unsqueeze(0).unsqueeze(0)

    def clear_kv_cache(self) -> None:
        self.mask_cache = None
        for block in self.transformer.h:
            block.attn.kv_cache = None


class Block(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
        self.attn = CausalSelfAttention(config)
        self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps)
        self.mlp = config.mlp_class(config)

        self.config = config

    def forward(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        n_1 = self.norm_1(x)
        h = self.attn(n_1, cos, sin, mask, input_pos)
        if self.config.parallel_residual:
            n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
            x = self.mlp(n_2) + h + x
        else:
            if self.config.shared_attention_norm:
                raise NotImplementedError(
                    "No checkpoint amongst the ones we support uses this configuration"
                    " (non-parallel residual and shared attention norm)."
                )
            x = h + x
            x = self.mlp(self.norm_2(x)) + x
        return x


class CausalSelfAttention(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
        # key, query, value projections for all heads, but in a batch
        self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # disabled by default
        self.kv_cache: Optional[KVCache] = None

        self.config = config

    def forward(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        qkv = self.attn(x)

        # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`)
        q_per_kv = self.config.n_head // self.config.n_query_groups
        total_qkv = q_per_kv + 2  # each group has 1+ queries, 1 key, and 1 value
        qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size)
        qkv = qkv.permute(0, 2, 3, 1, 4)  # (B, n_query_groups, total_qkv, T, hs)

        # split batched computation into three
        q, k, v = qkv.split((q_per_kv, 1, 1), dim=2)

        # maybe repeat k and v for the non multi-head attention cases
        # training: flash attention requires it
        # inference: multi-query would require a full kv cache so avoid it to limit its memory usage
        if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1):
            k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
            v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)

        q = q.reshape(B, -1, T, self.config.head_size)  # (B, nh_q, T, hs)
        k = k.reshape(B, -1, T, self.config.head_size)  # (B, nh_k, T, hs)
        v = v.reshape(B, -1, T, self.config.head_size)  # (B, nh_v, T, hs)

        q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin)
        k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin)
        q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1)
        k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1)

        if input_pos is not None:
            if not isinstance(self.kv_cache, KVCache):
                raise TypeError("You need to call `gpt.set_kv_cache()`")
            k, v = self.kv_cache(input_pos, k, v)

        y = self.scaled_dot_product_attention(q, k, v, mask)

        y = y.reshape(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        return self.proj(y)

    def scaled_dot_product_attention(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        scale = 1.0 / math.sqrt(self.config.head_size)
        y = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None
        )
        return y.transpose(1, 2)

    def build_kv_cache(
        self,
        batch_size: int,
        max_seq_length: int,
        rope_cache_length: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> "KVCache":
        heads = 1 if self.config.n_query_groups == 1 else self.config.n_head
        v_shape = (batch_size, heads, max_seq_length, self.config.head_size)
        if rope_cache_length is None:
            if self.config.rotary_percentage != 1.0:
                raise TypeError("Please pass the `rope_cache_length=gpt.cos.size(-1)` value")
            k_shape = v_shape
        else:
            k_shape = (
                batch_size,
                heads,
                max_seq_length,
                rope_cache_length + self.config.head_size - self.config.rope_n_elem,
            )
        return KVCache(k_shape, v_shape, device=device, dtype=dtype)


class GptNeoxMLP(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)

        self.config = config

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc(x)
        x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate)
        return self.proj(x)


class LLaMAMLP(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_fc_1 = self.fc_1(x)
        x_fc_2 = self.fc_2(x)
        x = torch.nn.functional.silu(x_fc_1) * x_fc_2
        return self.proj(x)


def build_rope_cache(
    seq_len: int, n_elem: int, device: Optional[torch.device] = None, base: int = 10000, condense_ratio: int = 1
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Enhanced Transformer with Rotary Position Embedding.

    Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
    transformers/rope/__init__.py. MIT License:
    https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
    """
    # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))

    # Create position indexes `[0, 1, ..., seq_len - 1]`
    seq_idx = torch.arange(seq_len, device=device) / condense_ratio

    # Calculate the product of position index and $\theta_i$
    idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)

    return torch.cos(idx_theta), torch.sin(idx_theta)


def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    head_size = x.size(-1)
    x1 = x[..., : head_size // 2]  # (B, nh, T, hs/2)
    x2 = x[..., head_size // 2 :]  # (B, nh, T, hs/2)
    rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
    roped = (x * cos) + (rotated * sin)
    return roped.type_as(x)


class KVCache(nn.Module):
    def __init__(
        self,
        k_shape: Tuple[int, int, int, int],
        v_shape: Tuple[int, int, int, int],
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        super().__init__()
        self.register_buffer("k", torch.zeros(k_shape, device=device, dtype=dtype), persistent=False)
        self.register_buffer("v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False)

    def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # move the buffer to the activation dtype for when AMP is used
        self.k = self.k.to(k.dtype)
        self.v = self.v.to(v.dtype)
        # update the cache
        k = self.k.index_copy_(2, input_pos, k)
        v = self.v.index_copy_(2, input_pos, v)
        return k, v

    def reset_parameters(self) -> None:
        torch.nn.init.zeros_(self.k)
        torch.nn.init.zeros_(self.v)
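To illustrate how the forward pass, the RoPE buffers and the KVCache above interact at inference time, here is a rough decoding sketch; the config name and token ids are assumptions for illustration only.

# Hypothetical decoding sketch; config name and token ids are assumptions.
import torch
from lit_gpt.config import Config
from lit_gpt.model import GPT

model = GPT(Config.from_name("pythia-160m")).eval()
model.max_seq_length = 32         # shrink the RoPE cache for a short session
model.set_kv_cache(batch_size=1)  # allocate KVCache buffers and the causal mask

prompt = torch.tensor([[5, 42, 7]])  # pretend token ids
with torch.no_grad():
    # prefill: the whole prompt populates the kv cache at positions 0..2
    logits = model(prompt, input_pos=torch.arange(3))
    next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
    # decode: only the new token is fed, at position 3
    logits = model(next_token, input_pos=torch.tensor([3]))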
lit_gpt/packed_dataset.py
ADDED
@@ -0,0 +1,237 @@
# Very loosely inspired by indexed_dataset in Fairseq, Megatron
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py


import os
import random
import struct

import numpy as np
import torch
from torch.utils.data import IterableDataset, get_worker_info

dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float32, 7: np.float64, 8: np.uint16}


def code(dtype):
    for k in dtypes:
        if dtypes[k] == dtype:
            return k
    raise ValueError(dtype)


HDR_MAGIC = b"LITPKDS"
HDR_SIZE = 24  # bytes


class PackedDataset(IterableDataset):
    def __init__(
        self, filenames, n_chunks, block_size, seed=12345, shuffle=True, wrap=False, num_processes=1, process_rank=0
    ):
        self._filenames = filenames
        self._n_chunks = n_chunks
        self._block_size = block_size
        self._seed = seed
        self._shuffle = shuffle
        self._wrap = wrap
        self._num_processes = num_processes
        self._process_rank = process_rank

    def __iter__(self):
        worker_info = get_worker_info()
        num_workers = worker_info.num_workers if worker_info is not None else 1
        worker_id = worker_info.id if worker_info is not None else 0
        num_shards = num_workers * self._num_processes
        shard_id = self._process_rank * num_workers + worker_id

        max_num_files = len(self._filenames) // num_shards * num_shards
        filenames = self._filenames[shard_id:max_num_files:num_shards]

        return PackedDatasetIterator(
            filenames=filenames,
            n_chunks=self._n_chunks,
            block_size=self._block_size,
            seed=self._seed,
            shuffle=self._shuffle,
            wrap=self._wrap,
        )


class PackedDatasetBuilder(object):
    def __init__(self, outdir, prefix, chunk_size, sep_token, dtype="auto", vocab_size=None):
        if dtype == "auto":
            if vocab_size is None:
                raise ValueError("vocab_size cannot be None when dtype='auto'")
            if vocab_size is not None and vocab_size < 65500:
                self._dtype = np.uint16
            else:
                self._dtype = np.int32
        else:
            self._dtype = dtype
        self._counter = 0
        self._chunk_size = chunk_size
        self._outdir = outdir
        self._prefix = prefix
        self._sep_token = sep_token
        self._arr = np.zeros(self._chunk_size, dtype=self._dtype)
        self._arr.fill(self._sep_token)
        self._idx = 0
        self._version = 1
        self._filenames = []

    def _write_chunk(self):
        filename = f"{self._prefix}_{self._counter:010d}.bin"
        filename = os.path.join(self._outdir, filename)

        with open(filename, "wb") as f:
            f.write(HDR_MAGIC)
            f.write(struct.pack("<Q", self._version))
            f.write(struct.pack("<B", code(self._dtype)))
            f.write(struct.pack("<Q", self._chunk_size))
            f.write(self._arr.tobytes(order="C"))

        self._filenames.append(filename)
        self._counter += 1
        self._arr.fill(self._sep_token)
        self._idx = 0

    @property
    def dtype(self):
        return self._dtype

    @property
    def filenames(self):
        return self._filenames.copy()

    def add_array(self, arr):
        while self._idx + arr.shape[0] > self._chunk_size:
            part_len = self._chunk_size - self._idx
            self._arr[self._idx : self._idx + part_len] = arr[:part_len]
            self._write_chunk()
            arr = arr[part_len:]

        arr_len = arr.shape[0]
        self._arr[self._idx : self._idx + arr_len] = arr
        self._idx += arr_len

    def write_reminder(self):
        self._write_chunk()


class PackedDatasetIterator:
    def __init__(self, filenames, n_chunks, block_size, seed, shuffle, wrap):
        self._seed = seed
        self._shuffle = shuffle
        self._rng = np.random.default_rng(seed) if shuffle else None
        self._block_idxs = None

        self._wrap = wrap

        # TODO: instead of filenames, we could have a single text stream
        # (or text file) with the sequence of all files to be
        # fetched/loaded.
        self._filenames = filenames
        self._file_idx = 0

        self._n_chunks = n_chunks

        self._dtype = None
        self._block_size = block_size
        self._n_blocks = None

        self._mmaps = []
        self._buffers = []

        self._block_idxs = []
        self._curr_idx = 0

        self._load_n_chunks()

    def _read_header(self, path):
        with open(path, "rb") as f:
            magic = f.read(len(HDR_MAGIC))
            assert magic == HDR_MAGIC, "File doesn't match expected format."
            version = struct.unpack("<Q", f.read(8))
            assert version == (1,)
            (dtype_code,) = struct.unpack("<B", f.read(1))
            dtype = dtypes[dtype_code]
            (chunk_size,) = struct.unpack("<Q", f.read(8))
        return dtype, chunk_size

    def _close_mmaps(self):
        for mmap in self._mmaps:
            mmap._mmap.close()

    def _load_n_chunks(self):
        self._close_mmaps()
        self._mmaps = []
        self._buffers = []

        if self._n_chunks > len(self._filenames[self._file_idx :]):
            if not self._wrap:
                raise StopIteration
            self._file_idx = 0

        for i in range(self._n_chunks):
            filename = self._filenames[self._file_idx + i]
            if self._dtype is None:
                self._dtype, self._chunk_size = self._read_header(filename)
                self._n_blocks = self._chunk_size // self._block_size
            # TODO: check header matches with previous files
            mmap = np.memmap(filename, mode="r", order="C", offset=HDR_SIZE)
            self._mmaps.append(mmap)
            self._buffers.append(memoryview(mmap))

        self._file_idx += self._n_chunks
        n_all_blocks = self._n_chunks * self._n_blocks

        self._block_idxs = self._rng.permutation(n_all_blocks) if self._shuffle else range(n_all_blocks)

        self._curr_idx = 0

    def __del__(self):
        self._close_mmaps()
        del self._mmaps
        del self._buffers

    def __iter__(self):
        return self

    def __next__(self):
        if self._curr_idx >= len(self._block_idxs):
            self._load_n_chunks()
            # TODO: trigger fetching next next n_chunks if remote
        block_idx = self._block_idxs[self._curr_idx]
        chunk_id = block_idx // self._n_blocks
        buffer = self._buffers[chunk_id]
        elem_id = (block_idx % self._n_blocks) * self._block_size
        offset = np.dtype(self._dtype).itemsize * elem_id
        arr = np.frombuffer(buffer, dtype=self._dtype, count=self._block_size, offset=offset)
        self._curr_idx += 1
        return torch.from_numpy(arr.astype(np.int64))


class CombinedDataset(IterableDataset):
    def __init__(self, datasets, seed, weights=None):
        self._seed = seed
        self._datasets = datasets
        self._weights = weights
        n_datasets = len(datasets)
        if weights is None:
            self._weights = [1 / n_datasets] * n_datasets
        else:
            self._weights = [w / sum(weights) for w in weights]

    def __iter__(self):
        return CombinedDatasetIterator(self._datasets, self._seed, self._weights)


class CombinedDatasetIterator:
    def __init__(self, datasets, seed, weights):
        self._datasets = [iter(el) for el in datasets]
        self._weights = weights
        self._rng = random.Random(seed)

    def __next__(self):
        (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1)
        return next(dataset)
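A short sketch of the intended round trip: PackedDatasetBuilder writes fixed-size .bin chunks behind the LITPKDS header, and PackedDataset streams (optionally shuffled) blocks back out. Paths, chunk size and token values below are assumptions.

# Hypothetical usage sketch; paths, sizes and token ids are assumptions.
import os
import numpy as np
from torch.utils.data import DataLoader
from lit_gpt.packed_dataset import PackedDataset, PackedDatasetBuilder

os.makedirs("data/pack", exist_ok=True)
builder = PackedDatasetBuilder(
    outdir="data/pack", prefix="train", chunk_size=2048 * 16, sep_token=0, dtype="auto", vocab_size=50254
)
builder.add_array(np.array([5, 42, 7, 0], dtype=builder.dtype))  # one tokenized document
builder.write_reminder()  # flush the partially filled last chunk to disk

dataset = PackedDataset(builder.filenames, n_chunks=1, block_size=2048, shuffle=True)
loader = DataLoader(dataset, batch_size=4)  # yields int64 blocks of 2048 tokens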
lit_gpt/rmsnorm.py
ADDED
@@ -0,0 +1,26 @@
import torch


class RMSNorm(torch.nn.Module):
    """Root Mean Square Layer Normalization.

    Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
    https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
    """

    def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(size))
        self.eps = eps
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        dtype = x.dtype
        x = x.float()
        # NOTE: the original RMSNorm paper implementation is not equivalent
        norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
        x_normed = x * torch.rsqrt(norm_x + self.eps)
        return (self.weight * x_normed).to(dtype=dtype)

    def reset_parameters(self) -> None:
        torch.nn.init.ones_(self.weight)
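As a quick numerical check of the forward pass above (each position is scaled by the reciprocal root mean square of its features; the shapes below are illustrative assumptions):

# Illustrative check; shapes are assumptions.
import torch
from lit_gpt.rmsnorm import RMSNorm

x = torch.randn(2, 4, 8)
norm = RMSNorm(size=8)
expected = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + norm.eps)
assert torch.allclose(norm(x), expected, atol=1e-6)  # weight starts as all ones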
lit_gpt/tokenizer.py
ADDED
@@ -0,0 +1,107 @@
import json
from pathlib import Path
from typing import Optional, Union

import torch


class Tokenizer:
    def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
        checkpoint_dir = Path(checkpoint_dir)
        if not checkpoint_dir.exists():
            raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}")

        self.use_bos = self.check_if_bos_token_used(checkpoint_dir)
        self.bos_id = None
        self.eos_id = None

        # some checkpoints have both files, `.model` takes precedence
        if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file():
            from sentencepiece import SentencePieceProcessor

            self.processor = SentencePieceProcessor(model_file=str(vocabulary_path))
            self.backend = "sentencepiece"
            self.bos_id = self.processor.bos_id()
            self.eos_id = self.processor.eos_id()

        elif (vocabulary_path := checkpoint_dir / "tokenizer.json").is_file():
            from tokenizers import Tokenizer as HFTokenizer

            self.processor = HFTokenizer.from_file(str(vocabulary_path))
            self.backend = "huggingface"

            if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
                with open(special_tokens_path) as fp:
                    config = json.load(fp)
                bos_token = config.get("bos_token")
                self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
                eos_token = config.get("eos_token")
                self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
            if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
                with open(special_tokens_path) as fp:
                    config = json.load(fp)
                if self.bos_id is None:
                    self.bos_id = config.get("bos_token_id")
                if self.eos_id is None:
                    self.eos_id = config.get("eos_token_id")
        else:
            raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        if self.backend == "huggingface":
            return self.processor.get_vocab_size(with_added_tokens=False)
        if self.backend == "sentencepiece":
            return self.processor.vocab_size()
        raise RuntimeError

    def token_to_id(self, token: str) -> int:
        if self.backend == "huggingface":
            id_ = self.processor.token_to_id(token)
        elif self.backend == "sentencepiece":
            id_ = self.processor.piece_to_id(token)
        else:
            raise RuntimeError
        if id_ is None:
            raise ValueError(f"token {token!r} not found in the collection.")
        return id_

    def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
        if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
            return False
        with open(tokenizer_config_path) as fp:
            config = json.load(fp)
        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
            return True
        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
        # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"

    def encode(
        self,
        string: str,
        device: Optional[torch.device] = None,
        bos: Optional[bool] = None,
        eos: bool = False,
        max_length: int = -1,
    ) -> torch.Tensor:
        if self.backend == "huggingface":
            tokens = self.processor.encode(string).ids
        elif self.backend == "sentencepiece":
            tokens = self.processor.encode(string)
        else:
            raise RuntimeError
        if bos or (bos is None and self.use_bos):
            bos_id = self.bos_id
            if bos_id is None:
                raise NotImplementedError("This tokenizer does not have a defined bos token")
            tokens = [bos_id] + tokens
        if eos:
            tokens = tokens + [self.eos_id]
        if max_length > 0:
            tokens = tokens[:max_length]
        return torch.tensor(tokens, dtype=torch.int, device=device)

    def decode(self, tensor: torch.Tensor) -> str:
        tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist()
        return self.processor.decode(tokens)
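A small sketch of the expected encode/decode round trip through this wrapper; the checkpoint directory below is an assumption and must contain tokenizer.model or tokenizer.json plus tokenizer_config.json.

# Hypothetical usage; the checkpoint path is an assumption.
from lit_gpt.tokenizer import Tokenizer

tokenizer = Tokenizer("checkpoints/EleutherAI/pythia-160m")
ids = tokenizer.encode("Hello world", eos=True)  # 1-D torch.int tensor with the eos id appended
text = tokenizer.decode(ids)
print(tokenizer.vocab_size, ids.tolist(), text)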
lit_gpt/utils.py
ADDED
@@ -0,0 +1,351 @@
1 |
+
"""Utility functions for training and inference."""
|
2 |
+
import math
|
3 |
+
import pickle
|
4 |
+
import sys
|
5 |
+
from contextlib import nullcontext
|
6 |
+
from io import BytesIO
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import TYPE_CHECKING, ContextManager, Dict, List, Mapping, Optional, TypeVar, Union
|
9 |
+
|
10 |
+
import lightning as L
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.utils._device
|
14 |
+
from lightning.fabric.strategies import FSDPStrategy
|
15 |
+
from lightning.fabric.utilities.load import _lazy_load as lazy_load
|
16 |
+
from torch.serialization import normalize_storage_type
|
17 |
+
|
18 |
+
if TYPE_CHECKING:
|
19 |
+
from lit_gpt import GPT
|
20 |
+
|
21 |
+
|
22 |
+
def find_multiple(n: int, k: int) -> int:
|
23 |
+
assert k > 0
|
24 |
+
if n % k == 0:
|
25 |
+
return n
|
26 |
+
return n + k - (n % k)
|
27 |
+
|
28 |
+
|
29 |
+
def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int:
|
30 |
+
total = 0
|
31 |
+
for p in module.parameters():
|
32 |
+
if requires_grad is None or p.requires_grad == requires_grad:
|
33 |
+
if hasattr(p, "quant_state"):
|
34 |
+
# bitsandbytes 4bit layer support
|
35 |
+
total += math.prod(p.quant_state[1])
|
36 |
+
else:
|
37 |
+
total += p.numel()
|
38 |
+
return total
|
39 |
+
|
40 |
+
|
41 |
+
def gptq_quantization(enabled: bool = False) -> ContextManager:
|
42 |
+
if not enabled:
|
43 |
+
return nullcontext()
|
44 |
+
|
45 |
+
from lightning.fabric.plugins.precision.utils import _ClassReplacementContextManager
|
46 |
+
|
47 |
+
from quantize.gptq import ColBlockQuantizedLinear
|
48 |
+
|
49 |
+
class QuantizedLinear(ColBlockQuantizedLinear):
|
50 |
+
def __init__(self, *args, **kwargs):
|
51 |
+
super().__init__(*args, bits=4, tile_cols=-1, **kwargs)
|
52 |
+
|
53 |
+
return _ClassReplacementContextManager({"torch.nn.Linear": QuantizedLinear})
|
54 |
+
|
55 |
+
|
56 |
+
def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None:
|
57 |
+
files = {
|
58 |
+
"lit_model.pth": (checkpoint_dir / "lit_model.pth").is_file(),
|
59 |
+
"lit_config.json": (checkpoint_dir / "lit_config.json").is_file(),
|
60 |
+
"tokenizer.json OR tokenizer.model": (checkpoint_dir / "tokenizer.json").is_file() or (
|
61 |
+
checkpoint_dir / "tokenizer.model"
|
62 |
+
).is_file(),
|
63 |
+
"tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(),
|
64 |
+
}
|
65 |
+
if checkpoint_dir.is_dir():
|
66 |
+
if all(files.values()):
|
67 |
+
# we're good
|
68 |
+
return
|
69 |
+
problem = f" is missing the files: {[f for f, exists in files.items() if not exists]!r}"
|
70 |
+
else:
|
71 |
+
problem = " is not a checkpoint directory"
|
72 |
+
|
73 |
+
# list locally available checkpoints
|
74 |
+
available = list(Path("checkpoints").glob("*/*"))
|
75 |
+
if available:
|
76 |
+
options = "\n --checkpoint_dir ".join([""] + [repr(str(p.resolve())) for p in available])
|
77 |
+
extra = f"\nYou have downloaded locally:{options}\n"
|
78 |
+
else:
|
79 |
+
extra = ""
|
80 |
+
|
81 |
+
error_message = (
|
82 |
+
f"--checkpoint_dir {str(checkpoint_dir.absolute())!r}{problem}."
|
83 |
+
"\nFind download instructions at https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials\n"
|
84 |
+
f"{extra}\nSee all download options by running:\n python scripts/download.py"
|
85 |
+
)
|
86 |
+
print(error_message, file=sys.stderr)
|
87 |
+
raise SystemExit(1)
|
88 |
+
|
89 |
+
|
90 |
+
class SavingProxyForStorage:
|
91 |
+
def __init__(self, obj, saver, protocol_version=5):
|
92 |
+
self.protocol_version = protocol_version
|
93 |
+
self.saver = saver
|
94 |
+
if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)):
|
95 |
+
raise TypeError(f"expected storage, not {type(obj)}")
|
96 |
+
|
97 |
+
# this logic is taken from PyTorch 2.0+ torch/serialization.py
|
98 |
+
if isinstance(obj, torch.storage.TypedStorage):
|
99 |
+
# PT upstream wants to deprecate this eventually...
|
100 |
+
storage = obj._untyped_storage
|
101 |
+
storage_type_str = obj._pickle_storage_type()
|
102 |
+
storage_type = getattr(torch, storage_type_str)
|
103 |
+
storage_numel = obj._size()
|
104 |
+
else:
|
105 |
+
storage = obj
|
106 |
+
storage_type = normalize_storage_type(type(obj))
|
107 |
+
storage_numel = storage.nbytes()
|
108 |
+
|
109 |
+
storage_key = saver._write_storage_and_return_key(storage)
|
110 |
+
location = torch.serialization.location_tag(storage)
|
111 |
+
|
112 |
+
self.storage_info = ("storage", storage_type, storage_key, location, storage_numel)
|
113 |
+
|
114 |
+
def __reduce_ex__(self, protocol_version):
|
115 |
+
assert False, "this should be handled with out of band"
|
116 |
+
|
117 |
+
|
118 |
+
class SavingProxyForTensor:
|
119 |
+
def __init__(self, tensor, saver, protocol_version=5):
|
120 |
+
self.protocol_version = protocol_version
|
121 |
+
self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version)
|
122 |
+
        if reduce_args[0] == torch._utils._rebuild_tensor_v2:
            # for Tensors with Python attributes
            (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args
            assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates"
            storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version)
            self.reduce_args = (a0, a1, (storage_proxy, *a2_other), *other_reduce_args)
        else:
            (storage, *other_reduce_args) = reduce_args
            assert isinstance(storage, torch.storage.TypedStorage), "Please check for updates"
            storage_proxy = SavingProxyForStorage(storage, saver, protocol_version=protocol_version)
            self.reduce_args = (storage_proxy, *other_reduce_args)

    def __reduce_ex__(self, protocol_version):
        if protocol_version != self.protocol_version:
            raise RuntimeError(f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}")
        return self.reduce_ret_fn, self.reduce_args


class IncrementalPyTorchPickler(pickle.Pickler):
    def __init__(self, saver, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.storage_dtypes = {}
        self.saver = saver
        self.id_map = {}

    # this logic is taken from PyTorch 2.0+ torch/serialization.py
    def persistent_id(self, obj):
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, SavingProxyForStorage):
            return obj.storage_info

        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                storage_numel = obj._size()

            else:
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                storage_numel = storage.nbytes()

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if storage.data_ptr() != 0:
                if storage.data_ptr() in self.storage_dtypes:
                    if storage_dtype != self.storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that view the same data as different types"
                        )
                else:
                    self.storage_dtypes[storage.data_ptr()] = storage_dtype

            storage_key = self.id_map.get(storage._cdata)
            if storage_key is None:
                storage_key = self.saver._write_storage_and_return_key(storage)
                self.id_map[storage._cdata] = storage_key
            location = torch.serialization.location_tag(storage)

            return ("storage", storage_type, storage_key, location, storage_numel)

        return None


class incremental_save:
    def __init__(self, name):
        self.name = name
        self.zipfile = torch._C.PyTorchFileWriter(str(name))
        self.has_saved = False
        self.next_key = 0

    def __enter__(self):
        return self

    def store_early(self, tensor):
        if isinstance(tensor, torch.Tensor):
            return SavingProxyForTensor(tensor, self)
        raise TypeError(f"can only store tensors early, not {type(tensor)}")

    def save(self, obj):
        if self.has_saved:
            raise RuntimeError("have already saved")
        # Write the pickle data for `obj`
        data_buf = BytesIO()
        pickler = IncrementalPyTorchPickler(self, data_buf, protocol=5)
        pickler.dump(obj)
        data_value = data_buf.getvalue()
        self.zipfile.write_record("data.pkl", data_value, len(data_value))
        self.has_saved = True

    def _write_storage_and_return_key(self, storage):
        if self.has_saved:
            raise RuntimeError("have already saved")
        key = self.next_key
        self.next_key += 1
        name = f"data/{key}"
        if storage.device.type != "cpu":
            storage = storage.cpu()
        num_bytes = storage.nbytes()
        self.zipfile.write_record(name, storage.data_ptr(), num_bytes)
        return key

    def __exit__(self, type, value, traceback):
        self.zipfile.write_end_of_file()

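A minimal usage sketch of the incremental checkpoint writer above; the file name and tensor contents are made up for illustration and are not part of this repository:

# Hypothetical sketch: stream tensor storages into a .pth file before pickling
# the surrounding object graph, using the helpers defined above.
import torch

state = {"weight": torch.randn(1024, 1024), "step": 42}
with incremental_save("example-ckpt.pth") as saver:
    # write the tensor's storage into the zip archive immediately
    state["weight"] = saver.store_early(state["weight"])
    # then pickle the (now proxied) dictionary once
    saver.save(state)
# the resulting file can be read back with torch.load("example-ckpt.pth")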
T = TypeVar("T")


def chunked_cross_entropy(
    logits: Union[torch.Tensor, List[torch.Tensor]], targets: torch.Tensor, chunk_size: int = 128
) -> torch.Tensor:
    # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate
    # the memory usage in fine-tuning settings with low number of parameters.
    # as a workaround hack, the cross entropy computation is chunked to force it to deallocate on the go, reducing
    # the memory spike's magnitude

    # lm_head was chunked (we are fine-tuning)
    if isinstance(logits, list):
        # don't want to chunk cross entropy
        if chunk_size == 0:
            logits = torch.cat(logits, dim=1)
            logits = logits.reshape(-1, logits.size(-1))
            targets = targets.reshape(-1)
            return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

        # chunk cross entropy
        logit_chunks = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits]
        target_chunks = [target_chunk.reshape(-1) for target_chunk in targets.split(logits[0].size(1), dim=1)]
        loss_chunks = [
            torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none")
            for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
        ]
        non_masked_elems = (targets != -1).sum()
        mean_loss = torch.cat(loss_chunks).sum() / max(1, non_masked_elems)
        return mean_loss

    # no chunking at all
    logits = logits.reshape(-1, logits.size(-1))
    targets = targets.reshape(-1)
    if chunk_size == 0:
        return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

    # lm_head wasn't chunked, chunk cross entropy
    logit_chunks = logits.split(chunk_size)
    target_chunks = targets.split(chunk_size)
    loss_chunks = [
        torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none")
        for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
    ]
    non_masked_elems = (targets != -1).sum()
    mean_loss = torch.cat(loss_chunks).sum() / max(1, non_masked_elems)
    return mean_loss
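A small sanity-check sketch for chunked_cross_entropy; the shapes and chunk size below are illustrative assumptions, chosen only to show that the chunked and unchunked paths agree:

# Illustrative sketch: chunked and unchunked cross entropy give the same mean loss.
import torch

batch, seq_len, vocab = 2, 16, 50304
logits = torch.randn(batch, seq_len, vocab)
targets = torch.randint(0, vocab, (batch, seq_len))
targets[:, -2:] = -1  # positions with ignore_index are excluded from the mean

loss_chunked = chunked_cross_entropy(logits, targets, chunk_size=4)
loss_full = chunked_cross_entropy(logits, targets, chunk_size=0)
assert torch.allclose(loss_chunked, loss_full, atol=1e-5)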


def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict:
    for checkpoint_name, attribute_name in mapping.items():
        full_checkpoint_name = prefix + checkpoint_name
        if full_checkpoint_name in state_dict:
            full_attribute_name = prefix + attribute_name
            state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name)
    return state_dict

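A short, hypothetical example of the key remapping performed by map_old_state_dict_weights; the layer and attribute names are invented for illustration:

# Hypothetical sketch: rename checkpoint keys to the attribute names a model expects.
old_sd = {"transformer.h.0.attn.weight": 1, "transformer.h.0.mlp.weight": 2}
mapping = {"attn.weight": "attention.weight"}
new_sd = map_old_state_dict_weights(old_sd, mapping, prefix="transformer.h.0.")
# new_sd now contains "transformer.h.0.attention.weight" instead of "...attn.weight"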
def get_default_supported_precision(training: bool) -> str:
    """Return default precision that is supported by the hardware: either `bf16` or `16`.

    Args:
        training: `-mixed` or `-true` version of the precision to use

    Returns:
        default precision that is suitable for the task and is supported by the hardware
    """
    from lightning.fabric.accelerators import MPSAccelerator

    if MPSAccelerator.is_available() or (torch.cuda.is_available() and not torch.cuda.is_bf16_supported()):
        return "16-mixed" if training else "16-true"
    return "bf16-mixed" if training else "bf16-true"


def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, strict: bool = True) -> None:
    if isinstance(fabric.strategy, FSDPStrategy):
        fabric.load_raw(checkpoint_path, model, strict=strict)
    else:
        state_dict = lazy_load(checkpoint_path)
        state_dict = state_dict.get("model", state_dict)
        model.load_state_dict(state_dict, strict=strict)

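A hedged sketch combining get_default_supported_precision and load_checkpoint; the Fabric setup and checkpoint path below are illustrative, not a prescribed workflow of this repository:

# Hypothetical sketch: choose a hardware-appropriate precision and load a checkpoint.
import lightning as L

precision = get_default_supported_precision(training=False)  # e.g. "bf16-true" where bf16 is supported
fabric = L.Fabric(devices=1, precision=precision)
# model = GPT(Config.from_name("pythia-160m"))
# model = fabric.setup(model)
# load_checkpoint(fabric, model, Path("out/redpajama/lit_model.pth"))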
def flops_per_param(max_seq_length: int, n_layer: int, n_embd: int, n_params: int) -> int:
    flops_per_token = 2 * n_params  # each parameter is used for a MAC (2 FLOPS) per network operation
    # this assumes that all samples have a fixed length equal to the block size
    # which is most likely false during finetuning
    flops_per_seq = flops_per_token * max_seq_length
    attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2))
    return flops_per_seq + attn_flops_per_seq


def estimate_flops(model: "GPT", training: bool) -> int:
    """Measures estimated FLOPs for MFU.

    Refs:
        * https://ar5iv.labs.arxiv.org/html/2205.05198#A1
        * https://ar5iv.labs.arxiv.org/html/2204.02311#A2
    """
    # using all parameters for this is a naive over estimation because not all model parameters actually contribute to
    # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage
    # (~10%) compared to the measured FLOPs, making those lower but more realistic.
    # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper.
    n_trainable_params = num_parameters(model, requires_grad=True)
    trainable_flops = flops_per_param(
        model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params
    )
    # forward + backward + gradients (assumes no gradient accumulation)
    ops_per_step = 3 if training else 1
    n_frozen_params = num_parameters(model, requires_grad=False)
    frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params)
    # forward + backward
    frozen_ops_per_step = 2 if training else 1
    return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops
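An illustrative sketch that plugs the pythia-160m numbers appearing elsewhere in this commit (block_size 2048, n_layer 12, n_embd 768, roughly 162M parameters) into flops_per_param; the printed value is only the rough forward-pass estimate this helper defines, not a measured number:

# Illustrative sketch: rough per-sequence FLOPs for the pythia-160m configuration.
n_params = 162_322_944
per_seq = flops_per_param(max_seq_length=2048, n_layer=12, n_embd=768, n_params=n_params)
print(f"~{per_seq / 1e9:.1f} GFLOPs for one forward pass over a 2048-token sequence")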
main.ipynb
ADDED
@@ -0,0 +1,714 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"data": {
|
10 |
+
"text/plain": [
|
11 |
+
"True"
|
12 |
+
]
|
13 |
+
},
|
14 |
+
"execution_count": 1,
|
15 |
+
"metadata": {},
|
16 |
+
"output_type": "execute_result"
|
17 |
+
}
|
18 |
+
],
|
19 |
+
"source": [
|
20 |
+
"import torch\n",
|
21 |
+
"\n",
|
22 |
+
"torch.cuda.is_available()"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": 2,
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"import glob\n",
|
32 |
+
"import math\n",
|
33 |
+
"import sys\n",
|
34 |
+
"import time\n",
|
35 |
+
"from pathlib import Path\n",
|
36 |
+
"from typing import Optional, Tuple, Union\n",
|
37 |
+
"\n",
|
38 |
+
"import lightning as L\n",
|
39 |
+
"import torch\n",
|
40 |
+
"from lightning.fabric.loggers import CSVLogger\n",
|
41 |
+
"from lightning.fabric.strategies import FSDPStrategy\n",
|
42 |
+
"from torch.utils.data import DataLoader\n",
|
43 |
+
"\n",
|
44 |
+
"# # support running without installing as a package\n",
|
45 |
+
"# wd = Path(__file__).parent.parent.resolve()\n",
|
46 |
+
"# sys.path.append(str(wd))\n",
|
47 |
+
"\n",
|
48 |
+
"from tsai_gpt.model import GPT, Block, Config\n",
|
49 |
+
"from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset\n",
|
50 |
+
"from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops\n",
|
51 |
+
"from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor\n",
|
52 |
+
"from tsai_gpt.utils import (\n",
|
53 |
+
" chunked_cross_entropy,\n",
|
54 |
+
" get_default_supported_precision,\n",
|
55 |
+
" num_parameters,\n",
|
56 |
+
" load_checkpoint,\n",
|
57 |
+
")"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 3,
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [],
|
65 |
+
"source": [
|
66 |
+
"model_name = \"pythia-160m\"\n",
|
67 |
+
"name = \"redpajama\"\n",
|
68 |
+
"out_dir = Path(\"out\") / name\n",
|
69 |
+
"save_interval = 1000\n",
|
70 |
+
"eval_interval = 1000\n",
|
71 |
+
"eval_iters = 100\n",
|
72 |
+
"log_interval = 100"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"cell_type": "code",
|
77 |
+
"execution_count": 4,
|
78 |
+
"metadata": {},
|
79 |
+
"outputs": [],
|
80 |
+
"source": [
|
81 |
+
"# Hyperparameters\n",
|
82 |
+
"learning_rate = 6e-3\n",
|
83 |
+
"batch_size = 32\n",
|
84 |
+
"micro_batch_size = 8\n",
|
85 |
+
"gradient_accumulation_steps = batch_size // micro_batch_size\n",
|
86 |
+
"assert gradient_accumulation_steps > 0\n",
|
87 |
+
"# max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices\n",
|
88 |
+
"max_iters = 15000\n",
|
89 |
+
"weight_decay = 1e-1\n",
|
90 |
+
"beta1 = 0.9\n",
|
91 |
+
"beta2 = 0.95\n",
|
92 |
+
"grad_clip = 1.0\n",
|
93 |
+
"decay_lr = True\n",
|
94 |
+
"warmup_iters = 2000\n",
|
95 |
+
"lr_decay_iters = max_iters\n",
|
96 |
+
"min_lr = 6e-6"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 5,
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"# Data proportions from https://arxiv.org/pdf/2302.13971.pdf Table 1\n",
|
106 |
+
"data_config = [\n",
|
107 |
+
" (\"arxiv\", 2.5),\n",
|
108 |
+
" (\"book\", 4.5),\n",
|
109 |
+
" (\"c4\", 15.0),\n",
|
110 |
+
" (\"cc\", 67.0),\n",
|
111 |
+
" (\"github\", 4.5),\n",
|
112 |
+
" (\"stackexchange\", 2.0),\n",
|
113 |
+
" (\"wikipedia\", 4.5),\n",
|
114 |
+
"]"
|
115 |
+
]
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"cell_type": "code",
|
119 |
+
"execution_count": 6,
|
120 |
+
"metadata": {},
|
121 |
+
"outputs": [],
|
122 |
+
"source": [
|
123 |
+
"hparams = {\n",
|
124 |
+
" k: v\n",
|
125 |
+
" for k, v in locals().items()\n",
|
126 |
+
" if isinstance(v, (int, float, str)) and not k.startswith(\"_\")\n",
|
127 |
+
"}\n",
|
128 |
+
"logger = CSVLogger(\"out\", name, flush_logs_every_n_steps=log_interval)\n",
|
129 |
+
"\n",
|
130 |
+
"\n",
|
131 |
+
"def setup(\n",
|
132 |
+
" devices: int = 4,\n",
|
133 |
+
" train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
|
134 |
+
" val_data_dir: Optional[Path] = None,\n",
|
135 |
+
" precision: Optional[str] = None,\n",
|
136 |
+
" resume: Union[bool, Path] = False,\n",
|
137 |
+
") -> None:\n",
|
138 |
+
" precision = precision or get_default_supported_precision(training=True)\n",
|
139 |
+
"\n",
|
140 |
+
" if devices > 1:\n",
|
141 |
+
" strategy = FSDPStrategy(\n",
|
142 |
+
" auto_wrap_policy={Block},\n",
|
143 |
+
" activation_checkpointing_policy={Block},\n",
|
144 |
+
" state_dict_type=\"full\",\n",
|
145 |
+
" limit_all_gathers=True,\n",
|
146 |
+
" cpu_offload=False,\n",
|
147 |
+
" )\n",
|
148 |
+
" else:\n",
|
149 |
+
" strategy = \"auto\"\n",
|
150 |
+
"\n",
|
151 |
+
" fabric = L.Fabric(\n",
|
152 |
+
" devices=devices, strategy=strategy, precision=precision, loggers=logger\n",
|
153 |
+
" )\n",
|
154 |
+
" fabric.print(hparams)\n",
|
155 |
+
" fabric.launch(main, train_data_dir, val_data_dir, resume)"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": 7,
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [],
|
163 |
+
"source": [
|
164 |
+
"model_copy = None"
|
165 |
+
]
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": 8,
|
170 |
+
"metadata": {},
|
171 |
+
"outputs": [],
|
172 |
+
"source": [
|
173 |
+
"def main(\n",
|
174 |
+
" fabric: L.Fabric,\n",
|
175 |
+
" train_data_dir: Path,\n",
|
176 |
+
" val_data_dir: Path,\n",
|
177 |
+
" resume: Union[bool, Path],\n",
|
178 |
+
") -> None:\n",
|
179 |
+
" global model_copy\n",
|
180 |
+
" speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit=\"seconds\")\n",
|
181 |
+
"\n",
|
182 |
+
" if fabric.global_rank == 0:\n",
|
183 |
+
" out_dir.mkdir(parents=True, exist_ok=True)\n",
|
184 |
+
"\n",
|
185 |
+
" config = Config.from_name(model_name)\n",
|
186 |
+
"\n",
|
187 |
+
" train_dataloader, val_dataloader = create_dataloaders(\n",
|
188 |
+
" batch_size=micro_batch_size,\n",
|
189 |
+
" block_size=config.block_size,\n",
|
190 |
+
" fabric=fabric,\n",
|
191 |
+
" train_data_dir=train_data_dir,\n",
|
192 |
+
" val_data_dir=val_data_dir,\n",
|
193 |
+
" seed=(1337 + fabric.global_rank),\n",
|
194 |
+
" )\n",
|
195 |
+
" if val_dataloader is None:\n",
|
196 |
+
" train_dataloader = fabric.setup_dataloaders(train_dataloader)\n",
|
197 |
+
" else:\n",
|
198 |
+
" train_dataloader, val_dataloader = fabric.setup_dataloaders(\n",
|
199 |
+
" train_dataloader, val_dataloader\n",
|
200 |
+
" )\n",
|
201 |
+
"\n",
|
202 |
+
" fabric.seed_everything(1337) # same seed for every process to init model (FSDP)\n",
|
203 |
+
"\n",
|
204 |
+
" fabric.print(f\"Loading model with {config.__dict__}\")\n",
|
205 |
+
" t0 = time.perf_counter()\n",
|
206 |
+
" import torch\n",
|
207 |
+
" import torch.nn as nn\n",
|
208 |
+
"\n",
|
209 |
+
" def _init_weights(module: nn.Module) -> None:\n",
|
210 |
+
" \"\"\"Meant to be used with `gpt.apply(gpt._init_weights)`.\"\"\"\n",
|
211 |
+
" if isinstance(module, nn.Linear):\n",
|
212 |
+
" torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
|
213 |
+
" if module.bias is not None:\n",
|
214 |
+
" torch.nn.init.zeros_(module.bias)\n",
|
215 |
+
" elif isinstance(module, nn.Embedding):\n",
|
216 |
+
" torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
|
217 |
+
"\n",
|
218 |
+
" with fabric.init_module(empty_init=True):\n",
|
219 |
+
" model = GPT(config)\n",
|
220 |
+
" model.apply(_init_weights)\n",
|
221 |
+
" model.apply(_init_weights)\n",
|
222 |
+
"\n",
|
223 |
+
" # checkpoint_path = Path(\"out/redpajama/iter-000999-ckpt.pth\")\n",
|
224 |
+
"\n",
|
225 |
+
" # load_checkpoint(fabric, model, checkpoint_path)\n",
|
226 |
+
"\n",
|
227 |
+
" # print(model.transformer.h[0].mlp.fc.weight)\n",
|
228 |
+
"\n",
|
229 |
+
" fabric.print(f\"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.\")\n",
|
230 |
+
" fabric.print(f\"Total parameters {num_parameters(model):,}\")\n",
|
231 |
+
"\n",
|
232 |
+
" model = fabric.setup(model)\n",
|
233 |
+
" optimizer = torch.optim.AdamW(\n",
|
234 |
+
" model.parameters(),\n",
|
235 |
+
" lr=learning_rate,\n",
|
236 |
+
" weight_decay=weight_decay,\n",
|
237 |
+
" betas=(beta1, beta2),\n",
|
238 |
+
" foreach=False,\n",
|
239 |
+
" )\n",
|
240 |
+
"\n",
|
241 |
+
" # model_copy = model\n",
|
242 |
+
"\n",
|
243 |
+
" optimizer = fabric.setup_optimizers(optimizer)\n",
|
244 |
+
"\n",
|
245 |
+
" state = {\n",
|
246 |
+
" \"model\": model,\n",
|
247 |
+
" \"optimizer\": optimizer,\n",
|
248 |
+
" \"hparams\": hparams,\n",
|
249 |
+
" \"iter_num\": 0,\n",
|
250 |
+
" \"step_count\": 0,\n",
|
251 |
+
" }\n",
|
252 |
+
"\n",
|
253 |
+
" if resume is True:\n",
|
254 |
+
" resume = max(out_dir.glob(\"*.pth\"), key=lambda p: int(p.name.split(\"-\")[1]))\n",
|
255 |
+
" if resume:\n",
|
256 |
+
" fabric.print(f\"Resuming training from {resume}\")\n",
|
257 |
+
" fabric.load(resume, state)\n",
|
258 |
+
"\n",
|
259 |
+
" train_time = time.perf_counter()\n",
|
260 |
+
" train(fabric, state, train_dataloader, val_dataloader, speed_monitor)\n",
|
261 |
+
" fabric.print(f\"Training time: {(time.perf_counter()-train_time):.2f}s\")\n",
|
262 |
+
" if fabric.device.type == \"cuda\":\n",
|
263 |
+
" fabric.print(f\"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB\")"
|
264 |
+
]
|
265 |
+
},
|
266 |
+
{
|
267 |
+
"cell_type": "code",
|
268 |
+
"execution_count": 9,
|
269 |
+
"metadata": {},
|
270 |
+
"outputs": [],
|
271 |
+
"source": [
|
272 |
+
"def train(\n",
|
273 |
+
" fabric: L.Fabric,\n",
|
274 |
+
" state: dict,\n",
|
275 |
+
" train_dataloader: DataLoader,\n",
|
276 |
+
" val_dataloader: DataLoader,\n",
|
277 |
+
" speed_monitor: SpeedMonitorBase,\n",
|
278 |
+
") -> None:\n",
|
279 |
+
" model = state[\"model\"]\n",
|
280 |
+
" optimizer = state[\"optimizer\"]\n",
|
281 |
+
"\n",
|
282 |
+
" if val_dataloader is not None:\n",
|
283 |
+
" validate(fabric, model, val_dataloader) # sanity check\n",
|
284 |
+
"\n",
|
285 |
+
" with torch.device(\"meta\"):\n",
|
286 |
+
" meta_model = GPT(model.config)\n",
|
287 |
+
" # \"estimated\" is not as precise as \"measured\". Estimated is optimistic but widely used in the wild.\n",
|
288 |
+
" # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs,\n",
|
289 |
+
" # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead\n",
|
290 |
+
" estimated_flops = estimate_flops(meta_model) * micro_batch_size\n",
|
291 |
+
" fabric.print(\n",
|
292 |
+
" f\"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}\"\n",
|
293 |
+
" )\n",
|
294 |
+
" x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length))\n",
|
295 |
+
" measured_flops = measure_flops(meta_model, x)\n",
|
296 |
+
" fabric.print(\n",
|
297 |
+
" f\"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}\"\n",
|
298 |
+
" )\n",
|
299 |
+
" del meta_model, x\n",
|
300 |
+
"\n",
|
301 |
+
" total_lengths = 0\n",
|
302 |
+
" total_t0 = time.perf_counter()\n",
|
303 |
+
"\n",
|
304 |
+
" for state[\"iter_num\"], train_data in enumerate(train_dataloader, state[\"iter_num\"]):\n",
|
305 |
+
" if state[\"iter_num\"] >= max_iters:\n",
|
306 |
+
" checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
|
307 |
+
" fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
|
308 |
+
" fabric.save(checkpoint_path, state)\n",
|
309 |
+
" break\n",
|
310 |
+
"\n",
|
311 |
+
" # determine and set the learning rate for this iteration\n",
|
312 |
+
" lr = get_lr(state[\"iter_num\"]) if decay_lr else learning_rate\n",
|
313 |
+
" for param_group in optimizer.param_groups:\n",
|
314 |
+
" param_group[\"lr\"] = lr\n",
|
315 |
+
"\n",
|
316 |
+
" iter_t0 = time.perf_counter()\n",
|
317 |
+
"\n",
|
318 |
+
" input_ids = train_data[:, 0 : model.max_seq_length].contiguous()\n",
|
319 |
+
" targets = train_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
|
320 |
+
"\n",
|
321 |
+
" is_accumulating = (state[\"iter_num\"] + 1) % gradient_accumulation_steps != 0\n",
|
322 |
+
" with fabric.no_backward_sync(model, enabled=is_accumulating):\n",
|
323 |
+
" logits = model(input_ids)\n",
|
324 |
+
" loss = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
|
325 |
+
" fabric.backward(loss / gradient_accumulation_steps)\n",
|
326 |
+
"\n",
|
327 |
+
" # return\n",
|
328 |
+
"\n",
|
329 |
+
" if not is_accumulating:\n",
|
330 |
+
" fabric.clip_gradients(model, optimizer, max_norm=grad_clip)\n",
|
331 |
+
" optimizer.step()\n",
|
332 |
+
" optimizer.zero_grad()\n",
|
333 |
+
" state[\"step_count\"] += 1\n",
|
334 |
+
"\n",
|
335 |
+
" t1 = time.perf_counter()\n",
|
336 |
+
" total_lengths += input_ids.size(1)\n",
|
337 |
+
" speed_monitor.on_train_batch_end(\n",
|
338 |
+
" (state[\"iter_num\"] + 1) * micro_batch_size,\n",
|
339 |
+
" t1 - total_t0,\n",
|
340 |
+
" # this assumes that device FLOPs are the same and that all devices have the same batch size\n",
|
341 |
+
" fabric.world_size,\n",
|
342 |
+
" flops_per_batch=measured_flops,\n",
|
343 |
+
" lengths=total_lengths,\n",
|
344 |
+
" )\n",
|
345 |
+
" if state[\"iter_num\"] % log_interval == 0:\n",
|
346 |
+
" fabric.print(\n",
|
347 |
+
" f\"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:\"\n",
|
348 |
+
" f\" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}\"\n",
|
349 |
+
" )\n",
|
350 |
+
"\n",
|
351 |
+
" if (\n",
|
352 |
+
" val_dataloader is not None\n",
|
353 |
+
" and not is_accumulating\n",
|
354 |
+
" and state[\"step_count\"] % eval_interval == 0\n",
|
355 |
+
" ):\n",
|
356 |
+
" t0 = time.perf_counter()\n",
|
357 |
+
" val_loss = validate(fabric, model, val_dataloader)\n",
|
358 |
+
" t1 = time.perf_counter() - t0\n",
|
359 |
+
" speed_monitor.eval_end(t1)\n",
|
360 |
+
" fabric.print(\n",
|
361 |
+
" f\"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms\"\n",
|
362 |
+
" )\n",
|
363 |
+
" fabric.barrier()\n",
|
364 |
+
" if not is_accumulating and state[\"step_count\"] % save_interval == 0:\n",
|
365 |
+
" checkpoint_path = out_dir / f\"iter-{state['iter_num']:06d}-ckpt.pth\"\n",
|
366 |
+
" fabric.print(f\"Saving checkpoint to {str(checkpoint_path)!r}\")\n",
|
367 |
+
" fabric.save(checkpoint_path, state)"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": 10,
|
373 |
+
"metadata": {},
|
374 |
+
"outputs": [],
|
375 |
+
"source": [
|
376 |
+
"@torch.inference_mode()\n",
|
377 |
+
"def validate(\n",
|
378 |
+
" fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader\n",
|
379 |
+
") -> torch.Tensor:\n",
|
380 |
+
" fabric.print(\"Validating ...\")\n",
|
381 |
+
" model.eval()\n",
|
382 |
+
"\n",
|
383 |
+
" losses = torch.zeros(eval_iters, device=fabric.device)\n",
|
384 |
+
" for k, val_data in enumerate(val_dataloader):\n",
|
385 |
+
" input_ids = val_data[:, 0 : model.max_seq_length].contiguous()\n",
|
386 |
+
" targets = val_data[:, 1 : model.max_seq_length + 1].contiguous()\n",
|
387 |
+
" logits = model(input_ids)\n",
|
388 |
+
" losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0)\n",
|
389 |
+
" out = losses.mean()\n",
|
390 |
+
"\n",
|
391 |
+
" model.train()\n",
|
392 |
+
" return out"
|
393 |
+
]
|
394 |
+
},
|
395 |
+
{
|
396 |
+
"cell_type": "code",
|
397 |
+
"execution_count": 11,
|
398 |
+
"metadata": {},
|
399 |
+
"outputs": [],
|
400 |
+
"source": [
|
401 |
+
"def create_dataloader(\n",
|
402 |
+
" batch_size: int,\n",
|
403 |
+
" block_size: int,\n",
|
404 |
+
" data_dir: Path,\n",
|
405 |
+
" fabric: L.Fabric,\n",
|
406 |
+
" shuffle: bool = True,\n",
|
407 |
+
" seed: int = 12345,\n",
|
408 |
+
") -> DataLoader:\n",
|
409 |
+
" datasets = []\n",
|
410 |
+
" for prefix, _ in data_config:\n",
|
411 |
+
" filenames = glob.glob(str(data_dir / f\"{prefix}*\"))\n",
|
412 |
+
" dataset = PackedDataset(\n",
|
413 |
+
" filenames,\n",
|
414 |
+
" n_chunks=4,\n",
|
415 |
+
" block_size=block_size,\n",
|
416 |
+
" shuffle=shuffle,\n",
|
417 |
+
" seed=seed,\n",
|
418 |
+
" num_processes=fabric.world_size,\n",
|
419 |
+
" process_rank=fabric.global_rank,\n",
|
420 |
+
" )\n",
|
421 |
+
" datasets.append(dataset)\n",
|
422 |
+
"\n",
|
423 |
+
" if not datasets:\n",
|
424 |
+
" raise RuntimeError(\n",
|
425 |
+
" f\"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset.\"\n",
|
426 |
+
" )\n",
|
427 |
+
"\n",
|
428 |
+
" weights = [weight for _, weight in data_config]\n",
|
429 |
+
" sum_weights = sum(weights)\n",
|
430 |
+
" weights = [el / sum_weights for el in weights]\n",
|
431 |
+
"\n",
|
432 |
+
" combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)\n",
|
433 |
+
"\n",
|
434 |
+
" return DataLoader(\n",
|
435 |
+
" combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True\n",
|
436 |
+
" )"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"cell_type": "code",
|
441 |
+
"execution_count": 12,
|
442 |
+
"metadata": {},
|
443 |
+
"outputs": [],
|
444 |
+
"source": [
|
445 |
+
"def create_dataloaders(\n",
|
446 |
+
" batch_size: int,\n",
|
447 |
+
" block_size: int,\n",
|
448 |
+
" fabric: L.Fabric,\n",
|
449 |
+
" train_data_dir: Path = Path(\"data/redpajama_sample\"),\n",
|
450 |
+
" val_data_dir: Optional[Path] = None,\n",
|
451 |
+
" seed: int = 12345,\n",
|
452 |
+
") -> Tuple[DataLoader, DataLoader]:\n",
|
453 |
+
" # Increase by one because we need the next word as well\n",
|
454 |
+
" effective_block_size = block_size + 1\n",
|
455 |
+
" train_dataloader = create_dataloader(\n",
|
456 |
+
" batch_size=batch_size,\n",
|
457 |
+
" block_size=effective_block_size,\n",
|
458 |
+
" fabric=fabric,\n",
|
459 |
+
" data_dir=train_data_dir,\n",
|
460 |
+
" shuffle=True,\n",
|
461 |
+
" seed=seed,\n",
|
462 |
+
" )\n",
|
463 |
+
" val_dataloader = (\n",
|
464 |
+
" create_dataloader(\n",
|
465 |
+
" batch_size=batch_size,\n",
|
466 |
+
" block_size=effective_block_size,\n",
|
467 |
+
" fabric=fabric,\n",
|
468 |
+
" data_dir=val_data_dir,\n",
|
469 |
+
" shuffle=False,\n",
|
470 |
+
" seed=seed,\n",
|
471 |
+
" )\n",
|
472 |
+
" if val_data_dir\n",
|
473 |
+
" else None\n",
|
474 |
+
" )\n",
|
475 |
+
" return train_dataloader, val_dataloader"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "code",
|
480 |
+
"execution_count": 13,
|
481 |
+
"metadata": {},
|
482 |
+
"outputs": [],
|
483 |
+
"source": [
|
484 |
+
"def get_lr(it: int) -> float:\n",
|
485 |
+
" # 1) linear warmup for warmup_iters steps\n",
|
486 |
+
" if it < warmup_iters:\n",
|
487 |
+
" return learning_rate * it / warmup_iters\n",
|
488 |
+
" # 2) if it > lr_decay_iters, return min learning rate\n",
|
489 |
+
" if it > lr_decay_iters:\n",
|
490 |
+
" return min_lr\n",
|
491 |
+
" # 3) in between, use cosine decay down to min learning rate\n",
|
492 |
+
" decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)\n",
|
493 |
+
" assert 0 <= decay_ratio <= 1\n",
|
494 |
+
" coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1\n",
|
495 |
+
" return min_lr + coeff * (learning_rate - min_lr)"
|
496 |
+
]
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"cell_type": "code",
|
500 |
+
"execution_count": 14,
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [
|
503 |
+
{
|
504 |
+
"name": "stderr",
|
505 |
+
"output_type": "stream",
|
506 |
+
"text": [
|
507 |
+
"Using bfloat16 Automatic Mixed Precision (AMP)\n",
|
508 |
+
"Seed set to 1337\n"
|
509 |
+
]
|
510 |
+
},
|
511 |
+
{
|
512 |
+
"name": "stdout",
|
513 |
+
"output_type": "stream",
|
514 |
+
"text": [
|
515 |
+
"{'model_name': 'pythia-160m', 'name': 'redpajama', 'save_interval': 1000, 'eval_interval': 1000, 'eval_iters': 100, 'log_interval': 100, 'learning_rate': 0.006, 'batch_size': 32, 'micro_batch_size': 8, 'gradient_accumulation_steps': 4, 'max_iters': 15000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'decay_lr': True, 'warmup_iters': 2000, 'lr_decay_iters': 15000, 'min_lr': 6e-06}\n",
|
516 |
+
"Loading model with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m-deduped'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}\n",
|
517 |
+
"Time to instantiate model: 1.99 seconds.\n",
|
518 |
+
"Total parameters 162,322,944\n",
|
519 |
+
"Estimated TFLOPs: 22.14\n",
|
520 |
+
"Measured TFLOPs: 15.86\n",
|
521 |
+
"iter 0 step 0: loss 11.0478, LR: 0.000000, iter time: 1312.30ms\n",
|
522 |
+
"iter 100 step 25: loss 7.3711, LR: 0.000300, iter time: 282.00ms\n",
|
523 |
+
"iter 200 step 50: loss 5.9653, LR: 0.000600, iter time: 293.93ms\n",
|
524 |
+
"iter 300 step 75: loss 6.1456, LR: 0.000900, iter time: 290.72ms\n",
|
525 |
+
"iter 400 step 100: loss 6.4233, LR: 0.001200, iter time: 291.77ms\n",
|
526 |
+
"iter 500 step 125: loss 5.8922, LR: 0.001500, iter time: 292.98ms\n",
|
527 |
+
"iter 600 step 150: loss 5.7330, LR: 0.001800, iter time: 292.54ms\n",
|
528 |
+
"iter 700 step 175: loss 5.2412, LR: 0.002100, iter time: 293.18ms\n",
|
529 |
+
"iter 800 step 200: loss 4.7973, LR: 0.002400, iter time: 291.61ms\n",
|
530 |
+
"iter 900 step 225: loss 5.4157, LR: 0.002700, iter time: 292.85ms\n",
|
531 |
+
"iter 1000 step 250: loss 5.1732, LR: 0.003000, iter time: 292.74ms\n",
|
532 |
+
"iter 1100 step 275: loss 5.1144, LR: 0.003300, iter time: 291.97ms\n",
|
533 |
+
"iter 1200 step 300: loss 4.6204, LR: 0.003600, iter time: 291.41ms\n",
|
534 |
+
"iter 1300 step 325: loss 5.2649, LR: 0.003900, iter time: 292.33ms\n",
|
535 |
+
"iter 1400 step 350: loss 5.3906, LR: 0.004200, iter time: 291.61ms\n",
|
536 |
+
"iter 1500 step 375: loss 5.1544, LR: 0.004500, iter time: 292.87ms\n",
|
537 |
+
"iter 1600 step 400: loss 5.2281, LR: 0.004800, iter time: 291.19ms\n",
|
538 |
+
"iter 1700 step 425: loss 4.6215, LR: 0.005100, iter time: 290.65ms\n",
|
539 |
+
"iter 1800 step 450: loss 5.1470, LR: 0.005400, iter time: 291.07ms\n",
|
540 |
+
"iter 1900 step 475: loss 5.1262, LR: 0.005700, iter time: 291.85ms\n",
|
541 |
+
"iter 2000 step 500: loss 4.7982, LR: 0.006000, iter time: 291.74ms\n",
|
542 |
+
"iter 2100 step 525: loss 4.7870, LR: 0.005999, iter time: 291.40ms\n",
|
543 |
+
"iter 2200 step 550: loss 4.6758, LR: 0.005997, iter time: 291.24ms\n",
|
544 |
+
"iter 2300 step 575: loss 4.2770, LR: 0.005992, iter time: 290.94ms\n",
|
545 |
+
"iter 2400 step 600: loss 4.9993, LR: 0.005986, iter time: 290.82ms\n",
|
546 |
+
"iter 2500 step 625: loss 4.7006, LR: 0.005978, iter time: 291.72ms\n",
|
547 |
+
"iter 2600 step 650: loss 4.4606, LR: 0.005969, iter time: 291.41ms\n",
|
548 |
+
"iter 2700 step 675: loss 4.2507, LR: 0.005957, iter time: 291.65ms\n",
|
549 |
+
"iter 2800 step 700: loss 4.2737, LR: 0.005944, iter time: 298.98ms\n",
|
550 |
+
"iter 2900 step 725: loss 3.2729, LR: 0.005929, iter time: 291.06ms\n",
|
551 |
+
"iter 3000 step 750: loss 3.6851, LR: 0.005913, iter time: 290.95ms\n",
|
552 |
+
"iter 3100 step 775: loss 4.3133, LR: 0.005895, iter time: 291.41ms\n",
|
553 |
+
"iter 3200 step 800: loss 4.0082, LR: 0.005875, iter time: 290.55ms\n",
|
554 |
+
"iter 3300 step 825: loss 4.4818, LR: 0.005853, iter time: 291.40ms\n",
|
555 |
+
"iter 3400 step 850: loss 4.0966, LR: 0.005830, iter time: 291.75ms\n",
|
556 |
+
"iter 3500 step 875: loss 3.3417, LR: 0.005805, iter time: 291.56ms\n",
|
557 |
+
"iter 3600 step 900: loss 3.3930, LR: 0.005779, iter time: 291.98ms\n",
|
558 |
+
"iter 3700 step 925: loss 3.9926, LR: 0.005751, iter time: 291.38ms\n",
|
559 |
+
"iter 3800 step 950: loss 4.4130, LR: 0.005721, iter time: 290.98ms\n",
|
560 |
+
"iter 3900 step 975: loss 4.2273, LR: 0.005690, iter time: 290.82ms\n",
|
561 |
+
"Saving checkpoint to 'out/redpajama/iter-003999-ckpt.pth'\n",
|
562 |
+
"iter 4000 step 1000: loss 4.1836, LR: 0.005657, iter time: 289.39ms\n",
|
563 |
+
"iter 4100 step 1025: loss 3.8898, LR: 0.005622, iter time: 290.57ms\n",
|
564 |
+
"iter 4200 step 1050: loss 3.2994, LR: 0.005586, iter time: 290.66ms\n",
|
565 |
+
"iter 4300 step 1075: loss 3.5536, LR: 0.005549, iter time: 291.97ms\n",
|
566 |
+
"iter 4400 step 1100: loss 4.0568, LR: 0.005510, iter time: 290.74ms\n",
|
567 |
+
"iter 4500 step 1125: loss 4.0688, LR: 0.005469, iter time: 291.51ms\n",
|
568 |
+
"iter 4600 step 1150: loss 3.9602, LR: 0.005428, iter time: 291.69ms\n",
|
569 |
+
"iter 4700 step 1175: loss 3.9015, LR: 0.005384, iter time: 291.05ms\n",
|
570 |
+
"iter 4800 step 1200: loss 3.9838, LR: 0.005340, iter time: 290.89ms\n",
|
571 |
+
"iter 4900 step 1225: loss 4.1498, LR: 0.005294, iter time: 291.43ms\n",
|
572 |
+
"iter 5000 step 1250: loss 3.9890, LR: 0.005246, iter time: 292.04ms\n",
|
573 |
+
"iter 5100 step 1275: loss 3.7998, LR: 0.005198, iter time: 291.67ms\n",
|
574 |
+
"iter 5200 step 1300: loss 4.3898, LR: 0.005148, iter time: 292.07ms\n",
|
575 |
+
"iter 5300 step 1325: loss 3.8301, LR: 0.005096, iter time: 291.71ms\n",
|
576 |
+
"iter 5400 step 1350: loss 3.9250, LR: 0.005044, iter time: 291.87ms\n",
|
577 |
+
"iter 5500 step 1375: loss 3.4592, LR: 0.004990, iter time: 292.45ms\n",
|
578 |
+
"iter 5600 step 1400: loss 3.9057, LR: 0.004936, iter time: 292.48ms\n",
|
579 |
+
"iter 5700 step 1425: loss 3.4640, LR: 0.004880, iter time: 292.17ms\n",
|
580 |
+
"iter 5800 step 1450: loss 3.5189, LR: 0.004823, iter time: 291.53ms\n",
|
581 |
+
"iter 5900 step 1475: loss 3.8723, LR: 0.004765, iter time: 291.76ms\n",
|
582 |
+
"iter 6000 step 1500: loss 3.5505, LR: 0.004705, iter time: 291.40ms\n",
|
583 |
+
"iter 6100 step 1525: loss 2.7599, LR: 0.004645, iter time: 290.44ms\n",
|
584 |
+
"iter 6200 step 1550: loss 4.0639, LR: 0.004584, iter time: 290.73ms\n",
|
585 |
+
"iter 6300 step 1575: loss 3.9124, LR: 0.004522, iter time: 290.77ms\n",
|
586 |
+
"iter 6400 step 1600: loss 3.7831, LR: 0.004459, iter time: 290.48ms\n",
|
587 |
+
"iter 6500 step 1625: loss 3.6439, LR: 0.004396, iter time: 291.02ms\n",
|
588 |
+
"iter 6600 step 1650: loss 3.6231, LR: 0.004331, iter time: 293.27ms\n",
|
589 |
+
"iter 6700 step 1675: loss 3.4389, LR: 0.004266, iter time: 291.11ms\n",
|
590 |
+
"iter 6800 step 1700: loss 3.5385, LR: 0.004200, iter time: 290.80ms\n",
|
591 |
+
"iter 6900 step 1725: loss 3.4988, LR: 0.004133, iter time: 291.01ms\n",
|
592 |
+
"iter 7000 step 1750: loss 3.8966, LR: 0.004066, iter time: 290.56ms\n",
|
593 |
+
"iter 7100 step 1775: loss 3.6816, LR: 0.003998, iter time: 290.93ms\n",
|
594 |
+
"iter 7200 step 1800: loss 3.4510, LR: 0.003929, iter time: 291.20ms\n",
|
595 |
+
"iter 7300 step 1825: loss 3.9102, LR: 0.003860, iter time: 292.28ms\n",
|
596 |
+
"iter 7400 step 1850: loss 3.6360, LR: 0.003790, iter time: 291.56ms\n",
|
597 |
+
"iter 7500 step 1875: loss 3.8664, LR: 0.003720, iter time: 290.58ms\n",
|
598 |
+
"iter 7600 step 1900: loss 3.6073, LR: 0.003650, iter time: 291.40ms\n",
|
599 |
+
"iter 7700 step 1925: loss 2.9199, LR: 0.003579, iter time: 290.78ms\n",
|
600 |
+
"iter 7800 step 1950: loss 2.7844, LR: 0.003508, iter time: 290.67ms\n",
|
601 |
+
"iter 7900 step 1975: loss 3.1176, LR: 0.003436, iter time: 291.73ms\n",
|
602 |
+
"Saving checkpoint to 'out/redpajama/iter-007999-ckpt.pth'\n",
|
603 |
+
"iter 8000 step 2000: loss 3.7936, LR: 0.003364, iter time: 290.49ms\n",
|
604 |
+
"iter 8100 step 2025: loss 3.6197, LR: 0.003292, iter time: 290.46ms\n",
|
605 |
+
"iter 8200 step 2050: loss 3.7480, LR: 0.003220, iter time: 291.78ms\n",
|
606 |
+
"iter 8300 step 2075: loss 3.6900, LR: 0.003148, iter time: 291.11ms\n",
|
607 |
+
"iter 8400 step 2100: loss 2.8864, LR: 0.003075, iter time: 291.39ms\n",
|
608 |
+
"iter 8500 step 2125: loss 3.6963, LR: 0.003003, iter time: 291.51ms\n",
|
609 |
+
"iter 8600 step 2150: loss 3.7093, LR: 0.002931, iter time: 291.80ms\n",
|
610 |
+
"iter 8700 step 2175: loss 3.3042, LR: 0.002858, iter time: 290.53ms\n",
|
611 |
+
"iter 8800 step 2200: loss 3.0944, LR: 0.002786, iter time: 290.83ms\n",
|
612 |
+
"iter 8900 step 2225: loss 3.4312, LR: 0.002714, iter time: 290.81ms\n",
|
613 |
+
"iter 9000 step 2250: loss 3.5048, LR: 0.002642, iter time: 290.99ms\n",
|
614 |
+
"iter 9100 step 2275: loss 3.2803, LR: 0.002570, iter time: 291.00ms\n",
|
615 |
+
"iter 9200 step 2300: loss 3.5930, LR: 0.002498, iter time: 292.10ms\n",
|
616 |
+
"iter 9300 step 2325: loss 2.2495, LR: 0.002427, iter time: 290.29ms\n",
|
617 |
+
"iter 9400 step 2350: loss 2.9088, LR: 0.002356, iter time: 290.19ms\n",
|
618 |
+
"iter 9500 step 2375: loss 2.6597, LR: 0.002286, iter time: 291.29ms\n",
|
619 |
+
"iter 9600 step 2400: loss 3.6206, LR: 0.002216, iter time: 291.64ms\n",
|
620 |
+
"iter 9700 step 2425: loss 2.3134, LR: 0.002146, iter time: 289.83ms\n",
|
621 |
+
"iter 9800 step 2450: loss 2.4301, LR: 0.002077, iter time: 289.59ms\n",
|
622 |
+
"iter 9900 step 2475: loss 2.4800, LR: 0.002008, iter time: 290.77ms\n",
|
623 |
+
"iter 10000 step 2500: loss 2.2368, LR: 0.001940, iter time: 290.11ms\n",
|
624 |
+
"iter 10100 step 2525: loss 3.1508, LR: 0.001873, iter time: 291.03ms\n",
|
625 |
+
"iter 10200 step 2550: loss 3.2954, LR: 0.001806, iter time: 291.14ms\n",
|
626 |
+
"iter 10300 step 2575: loss 3.0130, LR: 0.001740, iter time: 291.20ms\n",
|
627 |
+
"iter 10400 step 2600: loss 3.0044, LR: 0.001675, iter time: 290.75ms\n",
|
628 |
+
"iter 10500 step 2625: loss 2.8596, LR: 0.001610, iter time: 290.14ms\n",
|
629 |
+
"iter 10600 step 2650: loss 2.0126, LR: 0.001547, iter time: 290.53ms\n",
|
630 |
+
"iter 10700 step 2675: loss 3.0040, LR: 0.001484, iter time: 292.51ms\n",
|
631 |
+
"iter 10800 step 2700: loss 3.4691, LR: 0.001422, iter time: 290.79ms\n",
|
632 |
+
"iter 10900 step 2725: loss 3.3719, LR: 0.001361, iter time: 291.21ms\n",
|
633 |
+
"iter 11000 step 2750: loss 2.9904, LR: 0.001301, iter time: 292.52ms\n",
|
634 |
+
"iter 11100 step 2775: loss 2.7121, LR: 0.001241, iter time: 291.23ms\n",
|
635 |
+
"iter 11200 step 2800: loss 3.2472, LR: 0.001183, iter time: 291.06ms\n",
|
636 |
+
"iter 11300 step 2825: loss 3.3517, LR: 0.001126, iter time: 291.27ms\n",
|
637 |
+
"iter 11400 step 2850: loss 3.2715, LR: 0.001070, iter time: 292.14ms\n",
|
638 |
+
"iter 11500 step 2875: loss 3.4200, LR: 0.001016, iter time: 290.81ms\n",
|
639 |
+
"iter 11600 step 2900: loss 3.4924, LR: 0.000962, iter time: 291.75ms\n",
|
640 |
+
"iter 11700 step 2925: loss 2.2736, LR: 0.000910, iter time: 290.48ms\n",
|
641 |
+
"iter 11800 step 2950: loss 3.1776, LR: 0.000858, iter time: 291.91ms\n",
|
642 |
+
"iter 11900 step 2975: loss 3.1710, LR: 0.000808, iter time: 291.62ms\n",
|
643 |
+
"Saving checkpoint to 'out/redpajama/iter-011999-ckpt.pth'\n",
|
644 |
+
"iter 12000 step 3000: loss 3.6688, LR: 0.000760, iter time: 290.94ms\n",
|
645 |
+
"iter 12100 step 3025: loss 3.0179, LR: 0.000712, iter time: 290.84ms\n",
|
646 |
+
"iter 12200 step 3050: loss 3.2257, LR: 0.000666, iter time: 291.06ms\n",
|
647 |
+
"iter 12300 step 3075: loss 3.1653, LR: 0.000622, iter time: 292.47ms\n",
|
648 |
+
"iter 12400 step 3100: loss 3.4042, LR: 0.000578, iter time: 291.42ms\n",
|
649 |
+
"iter 12500 step 3125: loss 3.1884, LR: 0.000537, iter time: 290.93ms\n",
|
650 |
+
"iter 12600 step 3150: loss 3.4705, LR: 0.000496, iter time: 291.49ms\n",
|
651 |
+
"iter 12700 step 3175: loss 3.5805, LR: 0.000457, iter time: 291.72ms\n",
|
652 |
+
"iter 12800 step 3200: loss 2.8953, LR: 0.000420, iter time: 292.49ms\n",
|
653 |
+
"iter 12900 step 3225: loss 3.3408, LR: 0.000384, iter time: 297.87ms\n",
|
654 |
+
"iter 13000 step 3250: loss 3.0779, LR: 0.000349, iter time: 298.95ms\n",
|
655 |
+
"iter 13100 step 3275: loss 2.5973, LR: 0.000316, iter time: 291.06ms\n",
|
656 |
+
"iter 13200 step 3300: loss 3.5901, LR: 0.000285, iter time: 291.16ms\n",
|
657 |
+
"iter 13300 step 3325: loss 2.4544, LR: 0.000255, iter time: 290.62ms\n",
|
658 |
+
"iter 13400 step 3350: loss 2.9969, LR: 0.000227, iter time: 290.56ms\n",
|
659 |
+
"iter 13500 step 3375: loss 3.1975, LR: 0.000201, iter time: 291.62ms\n",
|
660 |
+
"iter 13600 step 3400: loss 2.8946, LR: 0.000176, iter time: 290.60ms\n",
|
661 |
+
"iter 13700 step 3425: loss 3.4701, LR: 0.000153, iter time: 291.61ms\n",
|
662 |
+
"iter 13800 step 3450: loss 2.6274, LR: 0.000131, iter time: 289.90ms\n",
|
663 |
+
"iter 13900 step 3475: loss 3.3881, LR: 0.000111, iter time: 291.66ms\n",
|
664 |
+
"iter 14000 step 3500: loss 3.0832, LR: 0.000093, iter time: 291.88ms\n",
|
665 |
+
"iter 14100 step 3525: loss 3.2224, LR: 0.000077, iter time: 291.17ms\n",
|
666 |
+
"iter 14200 step 3550: loss 3.5854, LR: 0.000062, iter time: 290.77ms\n",
|
667 |
+
"iter 14300 step 3575: loss 3.3620, LR: 0.000049, iter time: 292.27ms\n",
|
668 |
+
"iter 14400 step 3600: loss 3.5590, LR: 0.000037, iter time: 291.91ms\n",
|
669 |
+
"iter 14500 step 3625: loss 3.2781, LR: 0.000028, iter time: 290.50ms\n",
|
670 |
+
"iter 14600 step 3650: loss 3.4279, LR: 0.000020, iter time: 291.54ms\n",
|
671 |
+
"iter 14700 step 3675: loss 2.8695, LR: 0.000014, iter time: 291.52ms\n",
|
672 |
+
"iter 14800 step 3700: loss 2.8212, LR: 0.000009, iter time: 291.34ms\n",
|
673 |
+
"iter 14900 step 3725: loss 3.3649, LR: 0.000007, iter time: 292.48ms\n",
|
674 |
+
"Saving checkpoint to 'out/redpajama/iter-015000-ckpt.pth'\n",
|
675 |
+
"Training time: 4615.15s\n",
|
676 |
+
"Memory used: 21.58 GB\n"
|
677 |
+
]
|
678 |
+
}
|
679 |
+
],
|
680 |
+
"source": [
|
681 |
+
"torch.set_float32_matmul_precision(\"medium\")\n",
|
682 |
+
"setup(devices=1, train_data_dir=Path(\"data/lit-redpajama-sample\"))"
|
683 |
+
]
|
684 |
+
},
|
685 |
+
{
|
686 |
+
"cell_type": "code",
|
687 |
+
"execution_count": null,
|
688 |
+
"metadata": {},
|
689 |
+
"outputs": [],
|
690 |
+
"source": []
|
691 |
+
}
|
692 |
+
],
|
693 |
+
"metadata": {
|
694 |
+
"kernelspec": {
|
695 |
+
"display_name": "base",
|
696 |
+
"language": "python",
|
697 |
+
"name": "python3"
|
698 |
+
},
|
699 |
+
"language_info": {
|
700 |
+
"codemirror_mode": {
|
701 |
+
"name": "ipython",
|
702 |
+
"version": 3
|
703 |
+
},
|
704 |
+
"file_extension": ".py",
|
705 |
+
"mimetype": "text/x-python",
|
706 |
+
"name": "python",
|
707 |
+
"nbconvert_exporter": "python",
|
708 |
+
"pygments_lexer": "ipython3",
|
709 |
+
"version": "3.10.12"
|
710 |
+
}
|
711 |
+
},
|
712 |
+
"nbformat": 4,
|
713 |
+
"nbformat_minor": 2
|
714 |
+
}
|
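The notebook above warms the learning rate up linearly for 2000 iterations and then decays it with a cosine schedule down to min_lr. A self-contained re-implementation of that schedule with the notebook's hyperparameters, useful as a quick sanity check of the boundary values:

# Re-implementation of the notebook's get_lr schedule
# (warmup_iters=2000, lr_decay_iters=15000, learning_rate=6e-3, min_lr=6e-6).
import math

learning_rate, min_lr = 6e-3, 6e-6
warmup_iters, lr_decay_iters = 2000, 15000

def get_lr(it: int) -> float:
    if it < warmup_iters:  # 1) linear warmup
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:  # 2) past the decay horizon
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # 3) cosine decay, coeff in [0, 1]
    return min_lr + coeff * (learning_rate - min_lr)

print(get_lr(0), get_lr(2000), get_lr(15000))  # 0.0, 0.006, 6e-06, matching the logged LR values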
out/redpajama/iter-003999-ckpt.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:396f17fb6dcf0dff11914ce7b427547fa35b9fe9691a70084ceefc3f6b1d2a69
size 42205184
out/redpajama/iter-007999-ckpt.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c169e321ef26a1bcf3fe750aab25264f781c69e4763858824cb08979ebe7b13a
size 41943040
out/redpajama/iter-011999-ckpt.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad33194d951debfaf63810e94385dc23b0379e058ee7d22f9d059038d8f137e7
size 41943040
out/redpajama/lit_config.json
ADDED
@@ -0,0 +1 @@
{"name": "pythia-160m", "hf_config": {"org": "EleutherAI", "name": "pythia-160m"}, "block_size": 2048, "vocab_size": 50254, "padding_multiple": 128, "padded_vocab_size": 50304, "n_layer": 12, "n_head": 12, "n_embd": 768, "rotary_percentage": 0.25, "parallel_residual": true, "bias": true, "lm_head_bias": false, "n_query_groups": 12, "shared_attention_norm": false, "_norm_class": "LayerNorm", "norm_eps": 1e-05, "_mlp_class": "GptNeoxMLP", "gelu_approximate": "none", "intermediate_size": 3072, "rope_condense_ratio": 1, "rope_base": 10000}
out/redpajama/lit_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aae789bf9e490f230f8347baf067918c95be2d71b47112e9e63476a1894a19ad
size 44826624
out/redpajama/lit_model2.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:568b2c0443dc4464590b9bab5953f53eadc9c4ae3bcd00679e59d924fa3f7778
size 44826624
out/redpajama/tokenizer.json
ADDED
The diff for this file is too large to render.
out/redpajama/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
size 499723
out/redpajama/tokenizer_config.json
ADDED
@@ -0,0 +1,36 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
out/redpajama/version_1/metrics.csv
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
torch>=2.1.0
lightning @ git+https://github.com/Lightning-AI/lightning@6cbe9ceb560d798892bdae9186291acf9bf5d2e3
jsonargparse[signatures]  # CLI
gradio
sentencepiece
tokenizer_config.json
ADDED
@@ -0,0 +1,36 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
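A hypothetical sketch of rendering the Llama-style chat_template defined in this file. It assumes the transformers package is installed and that the tokenizer files in this repository load via AutoTokenizer; neither assumption is stated by the repository itself:

# Hypothetical sketch: apply the chat template above to a short conversation.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # directory containing tokenizer_config.json
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)  # "<s>[INST] <<SYS>>\n...\n<</SYS>>\n\nHello! [/INST]"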
tsai_gpt/__init__.py
ADDED
@@ -0,0 +1,15 @@
from tsai_gpt.model import GPT
from tsai_gpt.config import Config
from tsai_gpt.tokenizer import Tokenizer

from lightning_utilities.core.imports import RequirementCache

_LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.1.0.dev0")
if not bool(_LIGHTNING_AVAILABLE):
    raise ImportError(
        "Lit-GPT requires lightning==2.1. Please run:\n"
        f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}"
    )


__all__ = ["GPT", "Config", "Tokenizer"]
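A minimal sketch of the package-level API re-exported above, mirroring how the training notebook constructs the model; Config.from_name and GPT come from this repository:

# Minimal sketch: build the pythia-160m model through the package exports.
from tsai_gpt import GPT, Config

config = Config.from_name("pythia-160m")
model = GPT(config)
print(config.n_layer, config.n_embd, sum(p.numel() for p in model.parameters()))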
tsai_gpt/config.py
ADDED
@@ -0,0 +1,1181 @@
1 |
+
import json
|
2 |
+
from copy import deepcopy
|
3 |
+
from dataclasses import dataclass, field
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Any, Literal, Optional, Type, Union
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from typing_extensions import Self
|
9 |
+
|
10 |
+
import tsai_gpt.model
|
11 |
+
from tsai_gpt.utils import find_multiple
|
12 |
+
|
13 |
+
|
14 |
+
@dataclass
|
15 |
+
class Config:
|
16 |
+
name: str = ""
|
17 |
+
hf_config: dict = field(default_factory=dict)
|
18 |
+
block_size: int = 4096
|
19 |
+
vocab_size: int = 50254
|
20 |
+
padding_multiple: int = 512
|
21 |
+
padded_vocab_size: Optional[int] = None
|
22 |
+
n_layer: int = 16
|
23 |
+
n_head: int = 32
|
24 |
+
n_embd: int = 4096
|
25 |
+
rotary_percentage: float = 0.25
|
26 |
+
parallel_residual: bool = True
|
27 |
+
bias: bool = True
|
28 |
+
lm_head_bias: bool = False
|
29 |
+
# to use multi-head attention (MHA), set this to `n_head` (default)
|
30 |
+
# to use multi-query attention (MQA), set this to 1
|
31 |
+
# to use grouped-query attention (GQA), set this to a value in between
|
32 |
+
# Example with `n_head=4`
|
33 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
|
34 |
+
# │ v ││ v ││ v ││ v │ │ v │ │ v │ │ v │
|
35 |
+
# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
|
36 |
+
# │ │ │ │ │ │ │
|
37 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
|
38 |
+
# │ k ││ k ││ k ││ k │ │ k │ │ k │ │ k │
|
39 |
+
# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
|
40 |
+
# │ │ │ │ ┌──┴──┐ ┌──┴──┐ ┌────┬──┴─┬────┐
|
41 |
+
# ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐
|
42 |
+
# │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │
|
43 |
+
# └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘
|
44 |
+
# ◀──────────────────▶ ◀──────────────────▶ ◀──────────────────▶
|
45 |
+
# MHA GQA MQA
|
46 |
+
# n_query_groups=4 n_query_groups=2 n_query_groups=1
|
47 |
+
#
|
48 |
+
# credit https://arxiv.org/pdf/2305.13245.pdf
|
49 |
+
n_query_groups: Optional[int] = None
|
50 |
+
shared_attention_norm: bool = False
|
51 |
+
_norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
|
52 |
+
norm_eps: float = 1e-5
|
53 |
+
_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
|
54 |
+
gelu_approximate: str = "none"
|
55 |
+
intermediate_size: Optional[int] = None
|
56 |
+
rope_condense_ratio: int = 1
|
57 |
+
rope_base: int = 10000
|
58 |
+
|
59 |
+
def __post_init__(self):
|
60 |
+
if not self.name:
|
61 |
+
self.name = self.hf_config.get("name", self.name)
|
62 |
+
|
63 |
+
assert self.n_embd % self.n_head == 0
|
64 |
+
self.head_size = self.n_embd // self.n_head
|
65 |
+
|
66 |
+
# pad the vocab size up to the nearest multiple of `padding_multiple` so it is more efficient on hardware
|
67 |
+
if self.padded_vocab_size is None:
|
68 |
+
self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
|
69 |
+
else:
|
70 |
+
# vocab size shouldn't be larger than padded vocab size
|
71 |
+
self.vocab_size = min(self.vocab_size, self.padded_vocab_size)
|
72 |
+
|
73 |
+
# compute the number of query groups
|
74 |
+
if self.n_query_groups is not None:
|
75 |
+
assert self.n_head % self.n_query_groups == 0
|
76 |
+
else:
|
77 |
+
self.n_query_groups = self.n_head
|
78 |
+
|
79 |
+
# compute the intermediate size for MLP if not set
|
80 |
+
if self.intermediate_size is None:
|
81 |
+
if self._mlp_class == "LLaMAMLP":
|
82 |
+
raise ValueError("The config needs to set the `intermediate_size`")
|
83 |
+
self.intermediate_size = 4 * self.n_embd
|
84 |
+
|
85 |
+
self.rope_n_elem = int(self.rotary_percentage * self.head_size)
|
86 |
+
|
87 |
+
@classmethod
|
88 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
89 |
+
if name not in name_to_config:
|
90 |
+
# search through all `config['hf_config']['name']`
|
91 |
+
conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
|
92 |
+
else:
|
93 |
+
conf_dict = name_to_config[name]
|
94 |
+
|
95 |
+
conf_dict = conf_dict.copy()
|
96 |
+
if "condense_ratio" in kwargs: # legacy name
|
97 |
+
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
|
98 |
+
conf_dict.update(kwargs)
|
99 |
+
return cls(**conf_dict)
|
100 |
+
|
101 |
+
@classmethod
|
102 |
+
def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
|
103 |
+
with open(path, encoding="utf-8") as fp:
|
104 |
+
json_kwargs = json.load(fp)
|
105 |
+
if "condense_ratio" in json_kwargs: # legacy name
|
106 |
+
json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
|
107 |
+
if "condense_ratio" in kwargs: # legacy name
|
108 |
+
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
|
109 |
+
if "org" in json_kwargs: # legacy name
|
110 |
+
json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
|
111 |
+
if "org" in kwargs: # legacy name
|
112 |
+
kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
|
113 |
+
json_kwargs.update(kwargs)
|
114 |
+
return cls(**json_kwargs)
|
115 |
+
|
116 |
+
@property
|
117 |
+
def mlp_class(self) -> Type:
|
118 |
+
# `self._mlp_class` cannot be the type to keep the config json serializable
|
119 |
+
return getattr(tsai_gpt.model, self._mlp_class)
|
120 |
+
|
121 |
+
@property
|
122 |
+
def norm_class(self) -> Type:
|
123 |
+
# `self._norm_class` cannot be the type to keep the config json serializable
|
124 |
+
if self._norm_class == "RMSNorm":
|
125 |
+
from tsai_gpt.rmsnorm import RMSNorm
|
126 |
+
|
127 |
+
return RMSNorm
|
128 |
+
return getattr(torch.nn, self._norm_class)
|
129 |
+
|
130 |
+
|
131 |
+
########################
|
132 |
+
# Stability AI StableLM
|
133 |
+
########################
|
134 |
+
configs = [
|
135 |
+
# https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
|
136 |
+
dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
|
137 |
+
# https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
|
138 |
+
dict(
|
139 |
+
name="stablelm-base-alpha-7b",
|
140 |
+
hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
|
141 |
+
n_head=48,
|
142 |
+
n_embd=6144,
|
143 |
+
padding_multiple=256,
|
144 |
+
),
|
145 |
+
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
|
146 |
+
dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
|
147 |
+
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
|
148 |
+
dict(
|
149 |
+
name="stablelm-tuned-alpha-7b",
|
150 |
+
hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
|
151 |
+
n_head=48,
|
152 |
+
n_embd=6144,
|
153 |
+
padding_multiple=256,
|
154 |
+
),
|
155 |
+
]
|
156 |
+
|
157 |
+
####################
|
158 |
+
# EleutherAI Pythia
|
159 |
+
####################
|
160 |
+
pythia = [
|
161 |
+
# https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
|
162 |
+
dict(
|
163 |
+
name="pythia-70m",
|
164 |
+
hf_config=dict(org="EleutherAI", name="pythia-70m"),
|
165 |
+
block_size=2048,
|
166 |
+
n_layer=6,
|
167 |
+
n_embd=512,
|
168 |
+
n_head=8,
|
169 |
+
padding_multiple=128,
|
170 |
+
),
|
171 |
+
# https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
|
172 |
+
dict(
|
173 |
+
name="pythia-160m",
|
174 |
+
hf_config=dict(org="EleutherAI", name="pythia-160m"),
|
175 |
+
block_size=2048,
|
176 |
+
n_layer=12,
|
177 |
+
n_embd=768,
|
178 |
+
n_head=12,
|
179 |
+
padding_multiple=128,
|
180 |
+
),
|
181 |
+
# https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
|
182 |
+
dict(
|
183 |
+
name="pythia-410m",
|
184 |
+
hf_config=dict(org="EleutherAI", name="pythia-410m"),
|
185 |
+
block_size=2048,
|
186 |
+
n_layer=24,
|
187 |
+
n_embd=1024,
|
188 |
+
n_head=16,
|
189 |
+
padding_multiple=128,
|
190 |
+
),
|
191 |
+
# https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
|
192 |
+
dict(
|
193 |
+
name="pythia-1b",
|
194 |
+
hf_config=dict(org="EleutherAI", name="pythia-1b"),
|
195 |
+
block_size=2048,
|
196 |
+
n_embd=2048,
|
197 |
+
n_head=8,
|
198 |
+
padding_multiple=128,
|
199 |
+
),
|
200 |
+
# https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
|
201 |
+
dict(
|
202 |
+
name="pythia-1.4b",
|
203 |
+
hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
|
204 |
+
block_size=2048,
|
205 |
+
n_layer=24,
|
206 |
+
n_embd=2048,
|
207 |
+
n_head=16,
|
208 |
+
padding_multiple=128,
|
209 |
+
),
|
210 |
+
# https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
|
211 |
+
dict(
|
212 |
+
name="pythia-2.8b",
|
213 |
+
hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
|
214 |
+
block_size=2048,
|
215 |
+
n_layer=32,
|
216 |
+
n_embd=2560,
|
217 |
+
padding_multiple=128,
|
218 |
+
),
|
219 |
+
# https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
|
220 |
+
dict(
|
221 |
+
name="pythia-6.9b",
|
222 |
+
hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
|
223 |
+
block_size=2048,
|
224 |
+
n_layer=32,
|
225 |
+
padding_multiple=256,
|
226 |
+
),
|
227 |
+
# https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
|
228 |
+
dict(
|
229 |
+
name="pythia-12b",
|
230 |
+
hf_config=dict(org="EleutherAI", name="pythia-12b"),
|
231 |
+
block_size=2048,
|
232 |
+
n_layer=36,
|
233 |
+
n_embd=5120,
|
234 |
+
n_head=40,
|
235 |
+
),
|
236 |
+
]
|
237 |
+
configs.extend(pythia)
|
238 |
+
for c in pythia:
|
239 |
+
copy = c.copy()
|
240 |
+
copy["name"] = f"{c['name']}-deduped"
|
241 |
+
copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
|
242 |
+
configs.append(copy)
|
243 |
+
|
244 |
+
|
245 |
+
####################################
|
246 |
+
# togethercomputer RedPajama INCITE
|
247 |
+
####################################
|
248 |
+
redpajama_incite = [
|
249 |
+
# https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
|
250 |
+
dict(
|
251 |
+
name="RedPajama-INCITE-{}-3B-v1",
|
252 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
|
253 |
+
block_size=2048,
|
254 |
+
n_layer=32,
|
255 |
+
n_embd=2560,
|
256 |
+
padding_multiple=256,
|
257 |
+
rotary_percentage=1.0,
|
258 |
+
parallel_residual=False,
|
259 |
+
),
|
260 |
+
# https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
|
261 |
+
dict(
|
262 |
+
name="RedPajama-INCITE-7B-{}",
|
263 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
|
264 |
+
block_size=2048,
|
265 |
+
n_layer=32,
|
266 |
+
padding_multiple=256,
|
267 |
+
rotary_percentage=1.0,
|
268 |
+
parallel_residual=False,
|
269 |
+
),
|
270 |
+
# this redirects to the checkpoint above. kept for those who had the old weights already downloaded
|
271 |
+
dict(
|
272 |
+
name="RedPajama-INCITE-{}-7B-v0.1",
|
273 |
+
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
|
274 |
+
block_size=2048,
|
275 |
+
n_layer=32,
|
276 |
+
padding_multiple=256,
|
277 |
+
rotary_percentage=1.0,
|
278 |
+
parallel_residual=False,
|
279 |
+
),
|
280 |
+
]
|
281 |
+
for c in redpajama_incite:
|
282 |
+
for kind in ("Base", "Chat", "Instruct"):
|
283 |
+
copy = c.copy()
|
284 |
+
copy["name"] = c["name"].format(kind)
|
285 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
286 |
+
configs.append(copy)
|
287 |
+
|
288 |
+
|
289 |
+
#################
|
290 |
+
# TII UAE Falcon
|
291 |
+
#################
|
292 |
+
falcon = [
|
293 |
+
# https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
|
294 |
+
dict(
|
295 |
+
name="falcon-7b{}",
|
296 |
+
hf_config=dict(org="tiiuae", name="falcon-7b{}"),
|
297 |
+
block_size=2048,
|
298 |
+
vocab_size=65024,
|
299 |
+
padded_vocab_size=65024,
|
300 |
+
n_layer=32,
|
301 |
+
n_head=71,
|
302 |
+
n_embd=4544,
|
303 |
+
rotary_percentage=1.0,
|
304 |
+
n_query_groups=1,
|
305 |
+
bias=False,
|
306 |
+
# this is not in the config, but in the original model implementation, only for this config
|
307 |
+
shared_attention_norm=True,
|
308 |
+
),
|
309 |
+
# https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
|
310 |
+
dict(
|
311 |
+
name="falcon-40b{}",
|
312 |
+
hf_config=dict(org="tiiuae", name="falcon-40b{}"),
|
313 |
+
block_size=2048,
|
314 |
+
vocab_size=65024,
|
315 |
+
padded_vocab_size=65024,
|
316 |
+
n_layer=60,
|
317 |
+
n_head=128,
|
318 |
+
n_embd=8192,
|
319 |
+
rotary_percentage=1.0,
|
320 |
+
n_query_groups=8,
|
321 |
+
bias=False,
|
322 |
+
),
|
323 |
+
]
|
324 |
+
for c in falcon:
|
325 |
+
for kind in ("", "-instruct"):
|
326 |
+
copy = c.copy()
|
327 |
+
copy["name"] = c["name"].format(kind)
|
328 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
329 |
+
configs.append(copy)
|
330 |
+
|
331 |
+
# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
|
332 |
+
falcon180b = dict(
|
333 |
+
name="falcon-180B{}",
|
334 |
+
hf_config=dict(org="tiiuae", name="falcon-180B{}"),
|
335 |
+
block_size=2048,
|
336 |
+
vocab_size=65024,
|
337 |
+
padded_vocab_size=65024,
|
338 |
+
n_layer=80,
|
339 |
+
n_head=232,
|
340 |
+
n_embd=14848,
|
341 |
+
rotary_percentage=1.0,
|
342 |
+
n_query_groups=8,
|
343 |
+
bias=False,
|
344 |
+
)
|
345 |
+
|
346 |
+
for kind in ("", "-chat"):
|
347 |
+
copy = falcon180b.copy()
|
348 |
+
copy["name"] = falcon180b["name"].format(kind)
|
349 |
+
copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
|
350 |
+
configs.append(copy)
|
351 |
+
|
352 |
+
|
353 |
+
#############################
|
354 |
+
# OpenLM Research Open LLaMA
|
355 |
+
#############################
|
356 |
+
open_LLaMA = [
|
357 |
+
# https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
|
358 |
+
dict(
|
359 |
+
name="open_llama_3b",
|
360 |
+
hf_config=dict(org="openlm-research", name="open_llama_3b"),
|
361 |
+
block_size=2048,
|
362 |
+
vocab_size=32000,
|
363 |
+
padding_multiple=64,
|
364 |
+
n_layer=26,
|
365 |
+
n_embd=3200,
|
366 |
+
rotary_percentage=1.0,
|
367 |
+
parallel_residual=False,
|
368 |
+
bias=False,
|
369 |
+
_norm_class="RMSNorm",
|
370 |
+
norm_eps=1e-6,
|
371 |
+
_mlp_class="LLaMAMLP",
|
372 |
+
intermediate_size=8640,
|
373 |
+
),
|
374 |
+
# https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
|
375 |
+
dict(
|
376 |
+
name="open_llama_7b",
|
377 |
+
hf_config=dict(org="openlm-research", name="open_llama_7b"),
|
378 |
+
block_size=2048,
|
379 |
+
vocab_size=32000,
|
380 |
+
padding_multiple=64,
|
381 |
+
n_layer=32,
|
382 |
+
rotary_percentage=1.0,
|
383 |
+
parallel_residual=False,
|
384 |
+
bias=False,
|
385 |
+
_norm_class="RMSNorm",
|
386 |
+
norm_eps=1e-6,
|
387 |
+
_mlp_class="LLaMAMLP",
|
388 |
+
intermediate_size=11008,
|
389 |
+
),
|
390 |
+
# https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
|
391 |
+
dict(
|
392 |
+
name="open_llama_13b",
|
393 |
+
hf_config=dict(org="openlm-research", name="open_llama_13b"),
|
394 |
+
block_size=2048,
|
395 |
+
vocab_size=32000,
|
396 |
+
padding_multiple=64,
|
397 |
+
n_layer=40,
|
398 |
+
n_head=40,
|
399 |
+
n_embd=5120,
|
400 |
+
rotary_percentage=1.0,
|
401 |
+
parallel_residual=False,
|
402 |
+
bias=False,
|
403 |
+
_norm_class="RMSNorm",
|
404 |
+
norm_eps=1e-6,
|
405 |
+
_mlp_class="LLaMAMLP",
|
406 |
+
intermediate_size=13824,
|
407 |
+
),
|
408 |
+
]
|
409 |
+
configs.extend(open_LLaMA)
|
410 |
+
|
411 |
+
|
412 |
+
###############
|
413 |
+
# LMSYS Vicuna
|
414 |
+
###############
|
415 |
+
vicuna = [
|
416 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
|
417 |
+
dict(
|
418 |
+
name="vicuna-7b-v1.3",
|
419 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
|
420 |
+
block_size=2048,
|
421 |
+
vocab_size=32000,
|
422 |
+
padding_multiple=64,
|
423 |
+
n_layer=32,
|
424 |
+
rotary_percentage=1.0,
|
425 |
+
parallel_residual=False,
|
426 |
+
bias=False,
|
427 |
+
_norm_class="RMSNorm",
|
428 |
+
norm_eps=1e-6,
|
429 |
+
_mlp_class="LLaMAMLP",
|
430 |
+
intermediate_size=11008,
|
431 |
+
),
|
432 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
|
433 |
+
dict(
|
434 |
+
name="vicuna-13b-v1.3",
|
435 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
|
436 |
+
block_size=2048,
|
437 |
+
vocab_size=32000,
|
438 |
+
padding_multiple=64,
|
439 |
+
n_layer=40,
|
440 |
+
n_head=40,
|
441 |
+
n_embd=5120,
|
442 |
+
rotary_percentage=1.0,
|
443 |
+
parallel_residual=False,
|
444 |
+
bias=False,
|
445 |
+
_norm_class="RMSNorm",
|
446 |
+
norm_eps=1e-6,
|
447 |
+
_mlp_class="LLaMAMLP",
|
448 |
+
intermediate_size=13824,
|
449 |
+
),
|
450 |
+
# https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
|
451 |
+
dict(
|
452 |
+
name="vicuna-33b-v1.3",
|
453 |
+
hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
|
454 |
+
block_size=2048,
|
455 |
+
vocab_size=32000,
|
456 |
+
padding_multiple=64,
|
457 |
+
n_layer=60,
|
458 |
+
n_head=52,
|
459 |
+
n_embd=6656,
|
460 |
+
rotary_percentage=1.0,
|
461 |
+
parallel_residual=False,
|
462 |
+
bias=False,
|
463 |
+
_norm_class="RMSNorm",
|
464 |
+
norm_eps=1e-6,
|
465 |
+
_mlp_class="LLaMAMLP",
|
466 |
+
intermediate_size=17920,
|
467 |
+
),
|
468 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
|
469 |
+
dict(
|
470 |
+
name="vicuna-7b-v1.5",
|
471 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
|
472 |
+
vocab_size=32000,
|
473 |
+
padding_multiple=64,
|
474 |
+
n_layer=32,
|
475 |
+
rotary_percentage=1.0,
|
476 |
+
parallel_residual=False,
|
477 |
+
bias=False,
|
478 |
+
_norm_class="RMSNorm",
|
479 |
+
_mlp_class="LLaMAMLP",
|
480 |
+
intermediate_size=11008,
|
481 |
+
),
|
482 |
+
# https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
|
483 |
+
dict(
|
484 |
+
name="vicuna-7b-v1.5-16k",
|
485 |
+
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
|
486 |
+
block_size=16384,
|
487 |
+
vocab_size=32000,
|
488 |
+
padding_multiple=64,
|
489 |
+
n_layer=32,
|
490 |
+
rotary_percentage=1.0,
|
491 |
+
parallel_residual=False,
|
492 |
+
bias=False,
|
493 |
+
_norm_class="RMSNorm",
|
494 |
+
_mlp_class="LLaMAMLP",
|
495 |
+
intermediate_size=11008,
|
496 |
+
rope_condense_ratio=4,
|
497 |
+
),
|
498 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
|
499 |
+
dict(
|
500 |
+
name="vicuna-13b-v1.5",
|
501 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
|
502 |
+
vocab_size=32000,
|
503 |
+
padding_multiple=64,
|
504 |
+
n_layer=40,
|
505 |
+
n_head=40,
|
506 |
+
n_embd=5120,
|
507 |
+
rotary_percentage=1.0,
|
508 |
+
parallel_residual=False,
|
509 |
+
bias=False,
|
510 |
+
_norm_class="RMSNorm",
|
511 |
+
_mlp_class="LLaMAMLP",
|
512 |
+
intermediate_size=13824,
|
513 |
+
),
|
514 |
+
# https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
|
515 |
+
dict(
|
516 |
+
name="vicuna-13b-v1.5-16k",
|
517 |
+
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
|
518 |
+
block_size=16384,
|
519 |
+
vocab_size=32000,
|
520 |
+
padding_multiple=64,
|
521 |
+
n_layer=40,
|
522 |
+
n_head=40,
|
523 |
+
n_embd=5120,
|
524 |
+
rotary_percentage=1.0,
|
525 |
+
parallel_residual=False,
|
526 |
+
bias=False,
|
527 |
+
_norm_class="RMSNorm",
|
528 |
+
_mlp_class="LLaMAMLP",
|
529 |
+
intermediate_size=13824,
|
530 |
+
rope_condense_ratio=4,
|
531 |
+
),
|
532 |
+
]
|
533 |
+
configs.extend(vicuna)
|
534 |
+
|
535 |
+
|
536 |
+
#################
|
537 |
+
# LMSYS LongChat
|
538 |
+
#################
|
539 |
+
long_chat = [
|
540 |
+
# https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
|
541 |
+
dict(
|
542 |
+
name="longchat-7b-16k",
|
543 |
+
hf_config=dict(org="lmsys", name="longchat-7b-16k"),
|
544 |
+
block_size=16384,
|
545 |
+
vocab_size=32000,
|
546 |
+
padding_multiple=64,
|
547 |
+
n_layer=32,
|
548 |
+
rotary_percentage=1.0,
|
549 |
+
parallel_residual=False,
|
550 |
+
bias=False,
|
551 |
+
_norm_class="RMSNorm",
|
552 |
+
norm_eps=1e-6,
|
553 |
+
_mlp_class="LLaMAMLP",
|
554 |
+
intermediate_size=11008,
|
555 |
+
rope_condense_ratio=8,
|
556 |
+
),
|
557 |
+
# https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
|
558 |
+
dict(
|
559 |
+
name="longchat-13b-16k",
|
560 |
+
hf_config=dict(org="lmsys", name="longchat-13b-16k"),
|
561 |
+
block_size=16384,
|
562 |
+
vocab_size=32000,
|
563 |
+
padding_multiple=64,
|
564 |
+
n_layer=40,
|
565 |
+
n_head=40,
|
566 |
+
n_embd=5120,
|
567 |
+
rotary_percentage=1.0,
|
568 |
+
parallel_residual=False,
|
569 |
+
bias=False,
|
570 |
+
_norm_class="RMSNorm",
|
571 |
+
norm_eps=1e-6,
|
572 |
+
_mlp_class="LLaMAMLP",
|
573 |
+
intermediate_size=13824,
|
574 |
+
rope_condense_ratio=8,
|
575 |
+
),
|
576 |
+
]
|
577 |
+
configs.extend(long_chat)
|
578 |
+
|
579 |
+
|
580 |
+
######################
|
581 |
+
# NousResearch Hermes
|
582 |
+
######################
|
583 |
+
nous_research = [
|
584 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
|
585 |
+
dict(
|
586 |
+
name="Nous-Hermes-llama-2-7b",
|
587 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
|
588 |
+
padded_vocab_size=32000,
|
589 |
+
n_layer=32,
|
590 |
+
rotary_percentage=1.0,
|
591 |
+
parallel_residual=False,
|
592 |
+
bias=False,
|
593 |
+
_norm_class="RMSNorm",
|
594 |
+
norm_eps=1e-05,
|
595 |
+
_mlp_class="LLaMAMLP",
|
596 |
+
intermediate_size=11008,
|
597 |
+
),
|
598 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
|
599 |
+
dict(
|
600 |
+
name="Nous-Hermes-13b",
|
601 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
|
602 |
+
block_size=2048,
|
603 |
+
vocab_size=32000,
|
604 |
+
padded_vocab_size=32001,
|
605 |
+
n_layer=40,
|
606 |
+
n_head=40,
|
607 |
+
n_embd=5120,
|
608 |
+
rotary_percentage=1.0,
|
609 |
+
parallel_residual=False,
|
610 |
+
bias=False,
|
611 |
+
_norm_class="RMSNorm",
|
612 |
+
norm_eps=1e-6,
|
613 |
+
_mlp_class="LLaMAMLP",
|
614 |
+
intermediate_size=13824,
|
615 |
+
),
|
616 |
+
# https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
|
617 |
+
dict(
|
618 |
+
name="Nous-Hermes-Llama2-13b",
|
619 |
+
hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
|
620 |
+
vocab_size=32000,
|
621 |
+
padded_vocab_size=32032,
|
622 |
+
n_layer=40,
|
623 |
+
n_head=40,
|
624 |
+
n_embd=5120,
|
625 |
+
rotary_percentage=1.0,
|
626 |
+
parallel_residual=False,
|
627 |
+
bias=False,
|
628 |
+
_norm_class="RMSNorm",
|
629 |
+
norm_eps=1e-05,
|
630 |
+
_mlp_class="LLaMAMLP",
|
631 |
+
intermediate_size=13824,
|
632 |
+
),
|
633 |
+
]
|
634 |
+
configs.extend(nous_research)
|
635 |
+
|
636 |
+
|
637 |
+
###############
|
638 |
+
# Meta LLaMA 2
|
639 |
+
###############
|
640 |
+
llama_2 = [
|
641 |
+
# https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
|
642 |
+
dict(
|
643 |
+
name="Llama-2-7b{}-hf",
|
644 |
+
hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
|
645 |
+
vocab_size=32000,
|
646 |
+
padding_multiple=64,
|
647 |
+
n_layer=32,
|
648 |
+
rotary_percentage=1.0,
|
649 |
+
parallel_residual=False,
|
650 |
+
bias=False,
|
651 |
+
_norm_class="RMSNorm",
|
652 |
+
_mlp_class="LLaMAMLP",
|
653 |
+
intermediate_size=11008,
|
654 |
+
),
|
655 |
+
# https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
|
656 |
+
dict(
|
657 |
+
name="Llama-2-13b{}-hf",
|
658 |
+
hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
|
659 |
+
vocab_size=32000,
|
660 |
+
padding_multiple=64,
|
661 |
+
n_layer=40,
|
662 |
+
n_head=40,
|
663 |
+
n_embd=5120,
|
664 |
+
rotary_percentage=1.0,
|
665 |
+
parallel_residual=False,
|
666 |
+
bias=False,
|
667 |
+
_norm_class="RMSNorm",
|
668 |
+
_mlp_class="LLaMAMLP",
|
669 |
+
intermediate_size=13824,
|
670 |
+
),
|
671 |
+
# https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
|
672 |
+
dict(
|
673 |
+
name="Llama-2-70b{}-hf",
|
674 |
+
hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
|
675 |
+
vocab_size=32000,
|
676 |
+
padding_multiple=64,
|
677 |
+
n_layer=80,
|
678 |
+
n_head=64,
|
679 |
+
n_embd=8192,
|
680 |
+
n_query_groups=8,
|
681 |
+
rotary_percentage=1.0,
|
682 |
+
parallel_residual=False,
|
683 |
+
bias=False,
|
684 |
+
_norm_class="RMSNorm",
|
685 |
+
_mlp_class="LLaMAMLP",
|
686 |
+
intermediate_size=28672,
|
687 |
+
),
|
688 |
+
]
|
689 |
+
for c in llama_2:
|
690 |
+
for kind in ("", "-chat"):
|
691 |
+
copy = c.copy()
|
692 |
+
copy["name"] = c["name"].format(kind)
|
693 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
694 |
+
configs.append(copy)
|
695 |
+
|
696 |
+
|
697 |
+
##########################
|
698 |
+
# Stability AI FreeWilly2
|
699 |
+
##########################
|
700 |
+
freewilly_2 = [
|
701 |
+
# https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
|
702 |
+
dict(
|
703 |
+
name="FreeWilly2",
|
704 |
+
hf_config=dict(org="stabilityai", name="FreeWilly2"),
|
705 |
+
vocab_size=32000,
|
706 |
+
padding_multiple=64,
|
707 |
+
n_layer=80,
|
708 |
+
n_head=64,
|
709 |
+
n_embd=8192,
|
710 |
+
n_query_groups=8,
|
711 |
+
rotary_percentage=1.0,
|
712 |
+
parallel_residual=False,
|
713 |
+
bias=False,
|
714 |
+
_norm_class="RMSNorm",
|
715 |
+
_mlp_class="LLaMAMLP",
|
716 |
+
intermediate_size=28672,
|
717 |
+
)
|
718 |
+
]
|
719 |
+
configs.extend(freewilly_2)
|
720 |
+
|
721 |
+
|
722 |
+
##################
|
723 |
+
# Meta Code Llama
|
724 |
+
##################
|
725 |
+
code_llama = [
|
726 |
+
# https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
|
727 |
+
dict(
|
728 |
+
name="CodeLlama-7b-hf",
|
729 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
|
730 |
+
block_size=16384,
|
731 |
+
vocab_size=32016,
|
732 |
+
padding_multiple=16,
|
733 |
+
n_layer=32,
|
734 |
+
rotary_percentage=1.0,
|
735 |
+
parallel_residual=False,
|
736 |
+
bias=False,
|
737 |
+
_norm_class="RMSNorm",
|
738 |
+
norm_eps=1e-05,
|
739 |
+
_mlp_class="LLaMAMLP",
|
740 |
+
intermediate_size=11008,
|
741 |
+
rope_base=1000000,
|
742 |
+
),
|
743 |
+
# https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
|
744 |
+
dict(
|
745 |
+
name="CodeLlama-13b-hf",
|
746 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
|
747 |
+
block_size=16384,
|
748 |
+
vocab_size=32016,
|
749 |
+
padding_multiple=16,
|
750 |
+
n_layer=40,
|
751 |
+
n_head=40,
|
752 |
+
n_embd=5120,
|
753 |
+
rotary_percentage=1.0,
|
754 |
+
parallel_residual=False,
|
755 |
+
bias=False,
|
756 |
+
_norm_class="RMSNorm",
|
757 |
+
norm_eps=1e-05,
|
758 |
+
_mlp_class="LLaMAMLP",
|
759 |
+
intermediate_size=13824,
|
760 |
+
rope_base=1000000,
|
761 |
+
),
|
762 |
+
# https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
|
763 |
+
dict(
|
764 |
+
name="CodeLlama-34b-hf",
|
765 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
|
766 |
+
block_size=16384,
|
767 |
+
vocab_size=32000,
|
768 |
+
padding_multiple=64,
|
769 |
+
n_layer=48,
|
770 |
+
n_head=64,
|
771 |
+
n_embd=8192,
|
772 |
+
n_query_groups=8,
|
773 |
+
rotary_percentage=1.0,
|
774 |
+
parallel_residual=False,
|
775 |
+
bias=False,
|
776 |
+
_norm_class="RMSNorm",
|
777 |
+
norm_eps=1e-05,
|
778 |
+
_mlp_class="LLaMAMLP",
|
779 |
+
intermediate_size=22016,
|
780 |
+
rope_base=1000000,
|
781 |
+
),
|
782 |
+
# https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
|
783 |
+
dict(
|
784 |
+
name="CodeLlama-7b-Python-hf",
|
785 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
|
786 |
+
block_size=16384,
|
787 |
+
vocab_size=32000,
|
788 |
+
padding_multiple=64,
|
789 |
+
n_layer=32,
|
790 |
+
rotary_percentage=1.0,
|
791 |
+
parallel_residual=False,
|
792 |
+
bias=False,
|
793 |
+
_norm_class="RMSNorm",
|
794 |
+
norm_eps=1e-05,
|
795 |
+
_mlp_class="LLaMAMLP",
|
796 |
+
intermediate_size=11008,
|
797 |
+
rope_base=1000000,
|
798 |
+
),
|
799 |
+
# https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
|
800 |
+
dict(
|
801 |
+
name="CodeLlama-13b-Python-hf",
|
802 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
|
803 |
+
block_size=16384,
|
804 |
+
vocab_size=32000,
|
805 |
+
padding_multiple=64,
|
806 |
+
n_layer=40,
|
807 |
+
n_head=40,
|
808 |
+
n_embd=5120,
|
809 |
+
rotary_percentage=1.0,
|
810 |
+
parallel_residual=False,
|
811 |
+
bias=False,
|
812 |
+
_norm_class="RMSNorm",
|
813 |
+
norm_eps=1e-05,
|
814 |
+
_mlp_class="LLaMAMLP",
|
815 |
+
intermediate_size=13824,
|
816 |
+
rope_base=1000000,
|
817 |
+
),
|
818 |
+
# https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
|
819 |
+
dict(
|
820 |
+
name="CodeLlama-34b-Python-hf",
|
821 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
|
822 |
+
block_size=16384,
|
823 |
+
vocab_size=32000,
|
824 |
+
padding_multiple=64,
|
825 |
+
n_layer=48,
|
826 |
+
n_head=64,
|
827 |
+
n_embd=8192,
|
828 |
+
n_query_groups=8,
|
829 |
+
rotary_percentage=1.0,
|
830 |
+
parallel_residual=False,
|
831 |
+
bias=False,
|
832 |
+
_norm_class="RMSNorm",
|
833 |
+
norm_eps=1e-05,
|
834 |
+
_mlp_class="LLaMAMLP",
|
835 |
+
intermediate_size=22016,
|
836 |
+
rope_base=1000000,
|
837 |
+
),
|
838 |
+
# https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json
|
839 |
+
dict(
|
840 |
+
name="CodeLlama-7b-Instruct-hf",
|
841 |
+
hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
|
842 |
+
block_size=16384,
|
843 |
+
vocab_size=32016,
|
844 |
+
padding_multiple=16,
|
845 |
+
n_layer=32,
|
846 |
+
rotary_percentage=1.0,
|
847 |
+
parallel_residual=False,
|
848 |
+
bias=False,
|
849 |
+
_norm_class="RMSNorm",
|
850 |
+
norm_eps=1e-05,
|
851 |
+
_mlp_class="LLaMAMLP",
|
852 |
+
intermediate_size=11008,
|
853 |
+
rope_base=1000000,
|
854 |
+
),
|
855 |
+
# https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
|
856 |
+
dict(
|
857 |
+
name="CodeLlama-13b-Instruct-hf",
|
858 |
+
hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
|
859 |
+
block_size=2048,
|
860 |
+
vocab_size=32016,
|
861 |
+
padding_multiple=16,
|
862 |
+
n_layer=40,
|
863 |
+
n_head=40,
|
864 |
+
n_embd=5120,
|
865 |
+
rotary_percentage=1.0,
|
866 |
+
parallel_residual=False,
|
867 |
+
bias=False,
|
868 |
+
_norm_class="RMSNorm",
|
869 |
+
norm_eps=1e-05,
|
870 |
+
_mlp_class="LLaMAMLP",
|
871 |
+
intermediate_size=13824,
|
872 |
+
rope_base=1000000,
|
873 |
+
),
|
874 |
+
# https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
|
875 |
+
dict(
|
876 |
+
name="CodeLlama-34b-Instruct-hf",
|
877 |
+
hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
|
878 |
+
block_size=16384,
|
879 |
+
vocab_size=32000,
|
880 |
+
padding_multiple=64,
|
881 |
+
n_layer=48,
|
882 |
+
n_head=64,
|
883 |
+
n_embd=8192,
|
884 |
+
n_query_groups=8,
|
885 |
+
rotary_percentage=1.0,
|
886 |
+
parallel_residual=False,
|
887 |
+
bias=False,
|
888 |
+
_norm_class="RMSNorm",
|
889 |
+
norm_eps=1e-05,
|
890 |
+
_mlp_class="LLaMAMLP",
|
891 |
+
intermediate_size=22016,
|
892 |
+
rope_base=1000000,
|
893 |
+
),
|
894 |
+
]
|
895 |
+
configs.extend(code_llama)
|
896 |
+
|
897 |
+
|
898 |
+
########################
|
899 |
+
# garage-bAInd Platypus
|
900 |
+
########################
|
901 |
+
platypus = [
|
902 |
+
# https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
|
903 |
+
dict(
|
904 |
+
name="Platypus-30B",
|
905 |
+
hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
|
906 |
+
block_size=2048,
|
907 |
+
padded_vocab_size=32000,
|
908 |
+
n_layer=60,
|
909 |
+
n_head=52,
|
910 |
+
n_embd=6656,
|
911 |
+
rotary_percentage=1.0,
|
912 |
+
parallel_residual=False,
|
913 |
+
bias=False,
|
914 |
+
_norm_class="RMSNorm",
|
915 |
+
norm_eps=1e-06,
|
916 |
+
_mlp_class="LLaMAMLP",
|
917 |
+
intermediate_size=17920,
|
918 |
+
),
|
919 |
+
# https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
|
920 |
+
dict(
|
921 |
+
name="Platypus2-7B",
|
922 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
|
923 |
+
padded_vocab_size=32000,
|
924 |
+
n_layer=32,
|
925 |
+
rotary_percentage=1.0,
|
926 |
+
parallel_residual=False,
|
927 |
+
bias=False,
|
928 |
+
_norm_class="RMSNorm",
|
929 |
+
norm_eps=1e-05,
|
930 |
+
_mlp_class="LLaMAMLP",
|
931 |
+
intermediate_size=11008,
|
932 |
+
),
|
933 |
+
# https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
|
934 |
+
dict(
|
935 |
+
name="Platypus2-13B",
|
936 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
|
937 |
+
padded_vocab_size=32000,
|
938 |
+
n_layer=40,
|
939 |
+
n_head=40,
|
940 |
+
n_embd=5120,
|
941 |
+
rotary_percentage=1.0,
|
942 |
+
parallel_residual=False,
|
943 |
+
bias=False,
|
944 |
+
_norm_class="RMSNorm",
|
945 |
+
norm_eps=1e-05,
|
946 |
+
_mlp_class="LLaMAMLP",
|
947 |
+
intermediate_size=13824,
|
948 |
+
),
|
949 |
+
# https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
|
950 |
+
dict(
|
951 |
+
name="Platypus2-70B",
|
952 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
|
953 |
+
padded_vocab_size=32000,
|
954 |
+
n_layer=80,
|
955 |
+
n_head=64,
|
956 |
+
n_embd=8192,
|
957 |
+
rotary_percentage=1.0,
|
958 |
+
parallel_residual=False,
|
959 |
+
bias=False,
|
960 |
+
_norm_class="RMSNorm",
|
961 |
+
_mlp_class="LLaMAMLP",
|
962 |
+
intermediate_size=28672,
|
963 |
+
),
|
964 |
+
# https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
|
965 |
+
dict(
|
966 |
+
name="Camel-Platypus2-13B",
|
967 |
+
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
|
968 |
+
padded_vocab_size=32000,
|
969 |
+
n_layer=40,
|
970 |
+
n_head=40,
|
971 |
+
n_embd=5120,
|
972 |
+
rotary_percentage=1.0,
|
973 |
+
parallel_residual=False,
|
974 |
+
bias=False,
|
975 |
+
_norm_class="RMSNorm",
|
976 |
+
_mlp_class="LLaMAMLP",
|
977 |
+
intermediate_size=13824,
|
978 |
+
),
|
979 |
+
# https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
|
980 |
+
dict(
|
981 |
+
name="Camel-Platypus2-70B",
|
982 |
+
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
|
983 |
+
padded_vocab_size=32000,
|
984 |
+
n_layer=80,
|
985 |
+
n_head=64,
|
986 |
+
n_embd=8192,
|
987 |
+
n_query_groups=8,
|
988 |
+
rotary_percentage=1.0,
|
989 |
+
parallel_residual=False,
|
990 |
+
bias=False,
|
991 |
+
_norm_class="RMSNorm",
|
992 |
+
_mlp_class="LLaMAMLP",
|
993 |
+
intermediate_size=28672,
|
994 |
+
),
|
995 |
+
# https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
|
996 |
+
dict(
|
997 |
+
name="Stable-Platypus2-13B",
|
998 |
+
hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
|
999 |
+
padded_vocab_size=32000,
|
1000 |
+
n_layer=40,
|
1001 |
+
n_head=40,
|
1002 |
+
n_embd=5120,
|
1003 |
+
rotary_percentage=1.0,
|
1004 |
+
parallel_residual=False,
|
1005 |
+
bias=False,
|
1006 |
+
_norm_class="RMSNorm",
|
1007 |
+
_mlp_class="LLaMAMLP",
|
1008 |
+
intermediate_size=13824,
|
1009 |
+
),
|
1010 |
+
# https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
|
1011 |
+
dict(
|
1012 |
+
name="Platypus2-70B-instruct",
|
1013 |
+
hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
|
1014 |
+
padded_vocab_size=32000,
|
1015 |
+
n_layer=80,
|
1016 |
+
n_head=64,
|
1017 |
+
n_embd=8192,
|
1018 |
+
n_query_groups=8,
|
1019 |
+
rotary_percentage=1.0,
|
1020 |
+
parallel_residual=False,
|
1021 |
+
bias=False,
|
1022 |
+
_norm_class="RMSNorm",
|
1023 |
+
_mlp_class="LLaMAMLP",
|
1024 |
+
intermediate_size=28672,
|
1025 |
+
),
|
1026 |
+
]
|
1027 |
+
configs.extend(platypus)
|
1028 |
+
|
1029 |
+
|
1030 |
+
##########################
|
1031 |
+
# Stability AI StableCode
|
1032 |
+
##########################
|
1033 |
+
stablecode = [
|
1034 |
+
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
|
1035 |
+
dict(
|
1036 |
+
name="stablecode-completion-alpha-3b",
|
1037 |
+
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
|
1038 |
+
block_size=16384,
|
1039 |
+
vocab_size=49152,
|
1040 |
+
n_layer=32,
|
1041 |
+
n_embd=2560,
|
1042 |
+
),
|
1043 |
+
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
|
1044 |
+
dict(
|
1045 |
+
name="stablecode-completion-alpha-3b-4k",
|
1046 |
+
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
|
1047 |
+
vocab_size=49152,
|
1048 |
+
n_layer=32,
|
1049 |
+
n_embd=2560,
|
1050 |
+
),
|
1051 |
+
# https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
|
1052 |
+
dict(
|
1053 |
+
name="stablecode-instruct-alpha-3b",
|
1054 |
+
hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
|
1055 |
+
vocab_size=49152,
|
1056 |
+
n_layer=32,
|
1057 |
+
n_embd=2560,
|
1058 |
+
),
|
1059 |
+
]
|
1060 |
+
configs.extend(stablecode)
|
1061 |
+
|
1062 |
+
|
1063 |
+
##################################
|
1064 |
+
# togethercomputer LLaMA-2-7B-32K
|
1065 |
+
##################################
|
1066 |
+
together_llama2_32k = [
|
1067 |
+
# https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
|
1068 |
+
dict(
|
1069 |
+
name="LLaMA-2-7B-32K",
|
1070 |
+
hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
|
1071 |
+
vocab_size=32000,
|
1072 |
+
padding_multiple=64,
|
1073 |
+
n_layer=32,
|
1074 |
+
rotary_percentage=1.0,
|
1075 |
+
parallel_residual=False,
|
1076 |
+
bias=False,
|
1077 |
+
_norm_class="RMSNorm",
|
1078 |
+
_mlp_class="LLaMAMLP",
|
1079 |
+
intermediate_size=11008,
|
1080 |
+
rope_condense_ratio=8,
|
1081 |
+
)
|
1082 |
+
]
|
1083 |
+
configs.extend(together_llama2_32k)
|
1084 |
+
|
1085 |
+
|
1086 |
+
################
|
1087 |
+
# Microsoft Phi
|
1088 |
+
################
|
1089 |
+
phi = [
|
1090 |
+
# https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
|
1091 |
+
dict(
|
1092 |
+
name="phi-1_5",
|
1093 |
+
hf_config=dict(org="microsoft", name="phi-1_5"),
|
1094 |
+
vocab_size=50257,
|
1095 |
+
padded_vocab_size=51200,
|
1096 |
+
block_size=2048,
|
1097 |
+
n_embd=2048,
|
1098 |
+
n_layer=24,
|
1099 |
+
rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64
|
1100 |
+
shared_attention_norm=True,
|
1101 |
+
lm_head_bias=True,
|
1102 |
+
gelu_approximate="tanh",
|
1103 |
+
)
|
1104 |
+
]
|
1105 |
+
configs.extend(phi)
|
1106 |
+
|
1107 |
+
|
1108 |
+
#############
|
1109 |
+
# Mistral AI
|
1110 |
+
#############
|
1111 |
+
mistral = [
|
1112 |
+
# https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
|
1113 |
+
dict(
|
1114 |
+
name="Mistral-7B-{}v0.1",
|
1115 |
+
hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
|
1116 |
+
padded_vocab_size=32000,
|
1117 |
+
block_size=4096, # should be 32768 but sliding window attention is not implemented
|
1118 |
+
n_layer=32,
|
1119 |
+
n_query_groups=8,
|
1120 |
+
rotary_percentage=1.0,
|
1121 |
+
parallel_residual=False,
|
1122 |
+
bias=False,
|
1123 |
+
_norm_class="RMSNorm",
|
1124 |
+
norm_eps=1e-05,
|
1125 |
+
_mlp_class="LLaMAMLP",
|
1126 |
+
intermediate_size=14336,
|
1127 |
+
)
|
1128 |
+
]
|
1129 |
+
for c in mistral:
|
1130 |
+
for kind in ("", "Instruct-"):
|
1131 |
+
copy = c.copy()
|
1132 |
+
copy["name"] = c["name"].format(kind)
|
1133 |
+
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
|
1134 |
+
configs.append(copy)
|
1135 |
+
|
1136 |
+
|
1137 |
+
############
|
1138 |
+
# TinyLlama
|
1139 |
+
############
|
1140 |
+
tiny_llama = [
|
1141 |
+
dict(
|
1142 |
+
name="tiny-llama-1.1b",
|
1143 |
+
hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
|
1144 |
+
block_size=2048,
|
1145 |
+
vocab_size=32000,
|
1146 |
+
padding_multiple=64,
|
1147 |
+
n_layer=22,
|
1148 |
+
n_head=32,
|
1149 |
+
n_embd=2048,
|
1150 |
+
rotary_percentage=1.0,
|
1151 |
+
parallel_residual=False,
|
1152 |
+
bias=False,
|
1153 |
+
_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
|
1154 |
+
norm_eps=1e-5,
|
1155 |
+
_mlp_class="LLaMAMLP",
|
1156 |
+
intermediate_size=5632,
|
1157 |
+
n_query_groups=4,
|
1158 |
+
),
|
1159 |
+
dict(
|
1160 |
+
name="tiny-llama-new",
|
1161 |
+
hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
|
1162 |
+
block_size=768,
|
1163 |
+
vocab_size=32000,
|
1164 |
+
padding_multiple=64,
|
1165 |
+
n_layer=18,
|
1166 |
+
n_head=32,
|
1167 |
+
n_embd=1024,
|
1168 |
+
rotary_percentage=1.0,
|
1169 |
+
parallel_residual=False,
|
1170 |
+
bias=False,
|
1171 |
+
_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
|
1172 |
+
norm_eps=1e-5,
|
1173 |
+
_mlp_class="LLaMAMLP",
|
1174 |
+
intermediate_size=5632,
|
1175 |
+
n_query_groups=4,
|
1176 |
+
),
|
1177 |
+
]
|
1178 |
+
configs.extend(tiny_llama)
|
1179 |
+
|
1180 |
+
|
1181 |
+
name_to_config = {config["name"]: config for config in configs}
|
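The full `configs` list is flattened into `name_to_config`, so any registered checkpoint can be looked up by its name string. A minimal usage sketch of this config module (the `pythia-160m` entry is one of the configs registered above; the `block_size` override is purely illustrative):

from tsai_gpt.config import Config

# look up a registered name and override one field for a shorter context window
config = Config.from_name("pythia-160m", block_size=1024)

print(config.n_layer, config.n_head, config.n_embd)            # 12 12 768
print(config.padded_vocab_size)                                # 50304: vocab_size rounded up to a multiple of padding_multiple
print(config.mlp_class.__name__, config.norm_class.__name__)   # GptNeoxMLP LayerNorm

`Config.from_json` works the same way for a saved `lit_config.json`, translating the legacy `condense_ratio` and `org` keys before constructing the dataclass.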
tsai_gpt/model.py
ADDED
@@ -0,0 +1,342 @@
1 |
+
"""Full definition of a GPT NeoX Language Model, all of it in this single file.
|
2 |
+
|
3 |
+
Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
|
4 |
+
https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
|
5 |
+
"""
|
6 |
+
import math
|
7 |
+
from typing import Any, Optional, Tuple
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
from typing_extensions import Self
|
12 |
+
|
13 |
+
from tsai_gpt.config import Config
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
class GPT(nn.Module):
|
18 |
+
def __init__(self, config: Config) -> None:
|
19 |
+
super().__init__()
|
20 |
+
assert config.padded_vocab_size is not None
|
21 |
+
self.config = config
|
22 |
+
|
23 |
+
self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias)
|
24 |
+
self.transformer = nn.ModuleDict(
|
25 |
+
dict(
|
26 |
+
wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
|
27 |
+
h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
|
28 |
+
ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
|
29 |
+
)
|
30 |
+
)
|
31 |
+
self.max_seq_length = self.config.block_size
|
32 |
+
self.mask_cache: Optional[torch.Tensor] = None
|
33 |
+
|
34 |
+
@property
|
35 |
+
def max_seq_length(self) -> int:
|
36 |
+
return self._max_seq_length
|
37 |
+
|
38 |
+
@max_seq_length.setter
|
39 |
+
def max_seq_length(self, value: int) -> None:
|
40 |
+
"""
|
41 |
+
When doing inference, the sequences used might be shorter than the model's context length.
|
42 |
+
This allows setting a smaller number to avoid allocating unused memory
|
43 |
+
"""
|
44 |
+
if value > self.config.block_size:
|
45 |
+
raise ValueError(f"Cannot attend to {value}, block size is only {self.config.block_size}")
|
46 |
+
self._max_seq_length = value
|
47 |
+
if not hasattr(self, "cos"):
|
48 |
+
# first call
|
49 |
+
cos, sin = self.rope_cache()
|
50 |
+
self.register_buffer("cos", cos, persistent=False)
|
51 |
+
self.register_buffer("sin", sin, persistent=False)
|
52 |
+
elif value != self.cos.size(0):
|
53 |
+
# override
|
54 |
+
self.cos, self.sin = self.rope_cache(device=self.cos.device)
|
55 |
+
# the mask and kv cache size will get updated on `set_kv_cache`. we cannot update it here because we don't know
|
56 |
+
# if the kv cache is expected
|
57 |
+
|
58 |
+
def reset_parameters(self) -> None:
|
59 |
+
# Trigger resetting the rope-cache
|
60 |
+
self.max_seq_length = self.config.block_size
|
61 |
+
|
62 |
+
def _init_weights(self, module: nn.Module) -> None:
|
63 |
+
"""Meant to be used with `gpt.apply(gpt._init_weights)`."""
|
64 |
+
if isinstance(module, nn.Linear):
|
65 |
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
66 |
+
if module.bias is not None:
|
67 |
+
torch.nn.init.zeros_(module.bias)
|
68 |
+
elif isinstance(module, nn.Embedding):
|
69 |
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
70 |
+
|
71 |
+
def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
|
72 |
+
T = idx.size(1)
|
73 |
+
if self.max_seq_length < T:
|
74 |
+
raise ValueError(f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}.")
|
75 |
+
|
76 |
+
if input_pos is not None: # use the kv cache
|
77 |
+
cos = self.cos.index_select(0, input_pos)
|
78 |
+
sin = self.sin.index_select(0, input_pos)
|
79 |
+
if self.mask_cache is None:
|
80 |
+
raise TypeError("You need to call `gpt.set_kv_cache()`")
|
81 |
+
mask = self.mask_cache.index_select(2, input_pos)
|
82 |
+
else:
|
83 |
+
cos = self.cos[:T]
|
84 |
+
sin = self.sin[:T]
|
85 |
+
mask = None
|
86 |
+
|
87 |
+
x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
88 |
+
for block in self.transformer.h:
|
89 |
+
x = block(x, cos, sin, mask, input_pos)
|
90 |
+
x = self.transformer.ln_f(x)
|
91 |
+
return self.lm_head(x) # (b, t, vocab_size)
|
92 |
+
|
93 |
+
@classmethod
|
94 |
+
def from_name(cls, name: str, **kwargs: Any) -> Self:
|
95 |
+
return cls(Config.from_name(name, **kwargs))
|
96 |
+
|
97 |
+
def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tensor, torch.Tensor]:
|
98 |
+
return build_rope_cache(
|
99 |
+
seq_len=self.max_seq_length,
|
100 |
+
n_elem=self.config.rope_n_elem,
|
101 |
+
device=device,
|
102 |
+
condense_ratio=self.config.rope_condense_ratio,
|
103 |
+
base=self.config.rope_base,
|
104 |
+
)
|
105 |
+
|
106 |
+
def set_kv_cache(
|
107 |
+
self,
|
108 |
+
batch_size: int,
|
109 |
+
rope_cache_length: Optional[int] = None,
|
110 |
+
device: Optional[torch.device] = None,
|
111 |
+
dtype: Optional[torch.dtype] = None,
|
112 |
+
) -> None:
|
113 |
+
if rope_cache_length is None:
|
114 |
+
rope_cache_length = self.cos.size(-1)
|
115 |
+
max_seq_length = self.max_seq_length
|
116 |
+
|
117 |
+
# initialize the kv cache for all blocks
|
118 |
+
for block in self.transformer.h:
|
119 |
+
block.attn.kv_cache = block.attn.build_kv_cache(
|
120 |
+
batch_size, max_seq_length, rope_cache_length, device, dtype
|
121 |
+
)
|
122 |
+
|
123 |
+
if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length:
|
124 |
+
# passing `attn_mask` to SDPA downgrades it to use the inefficient implementation. since we only need the mask
|
125 |
+
# for the kv-cache support (only during inference), we only create it in that situation
|
126 |
+
# this will be resolved by https://github.com/pytorch/pytorch/issues/96099
|
127 |
+
ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool)
|
128 |
+
self.mask_cache = torch.tril(ones).unsqueeze(0).unsqueeze(0)
|
129 |
+
|
130 |
+
def clear_kv_cache(self) -> None:
|
131 |
+
self.mask_cache = None
|
132 |
+
for block in self.transformer.h:
|
133 |
+
block.attn.kv_cache = None
|
134 |
+
|
135 |
+
|
136 |
+
class Block(nn.Module):
|
137 |
+
def __init__(self, config: Config) -> None:
|
138 |
+
super().__init__()
|
139 |
+
self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
140 |
+
self.attn = CausalSelfAttention(config)
|
141 |
+
self.norm_2 = None if config.shared_attention_norm else config.norm_class(config.n_embd, eps=config.norm_eps)
|
142 |
+
self.mlp = config.mlp_class(config)
|
143 |
+
|
144 |
+
self.config = config
|
145 |
+
|
146 |
+
def forward(
|
147 |
+
self,
|
148 |
+
x: torch.Tensor,
|
149 |
+
cos: torch.Tensor,
|
150 |
+
sin: torch.Tensor,
|
151 |
+
mask: Optional[torch.Tensor] = None,
|
152 |
+
input_pos: Optional[torch.Tensor] = None,
|
153 |
+
) -> torch.Tensor:
|
154 |
+
n_1 = self.norm_1(x)
|
155 |
+
h = self.attn(n_1, cos, sin, mask, input_pos)
|
156 |
+
if self.config.parallel_residual:
|
157 |
+
n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
|
158 |
+
x = self.mlp(n_2) + h + x
|
159 |
+
else:
|
160 |
+
if self.config.shared_attention_norm:
|
161 |
+
raise NotImplementedError(
|
162 |
+
"No checkpoint amongst the ones we support uses this configuration"
|
163 |
+
" (non-parallel residual and shared attention norm)."
|
164 |
+
)
|
165 |
+
x = h + x
|
166 |
+
x = self.mlp(self.norm_2(x)) + x
|
167 |
+
return x
|
168 |
+
|
169 |
+
|
170 |
+
class CausalSelfAttention(nn.Module):
|
171 |
+
def __init__(self, config: Config) -> None:
|
172 |
+
super().__init__()
|
173 |
+
shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
|
174 |
+
# key, query, value projections for all heads, but in a batch
|
175 |
+
self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
|
176 |
+
# output projection
|
177 |
+
self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
178 |
+
# disabled by default
|
179 |
+
self.kv_cache: Optional[KVCache] = None
|
180 |
+
|
181 |
+
self.config = config
|
182 |
+
|
183 |
+
def forward(
|
184 |
+
self,
|
185 |
+
x: torch.Tensor,
|
186 |
+
cos: torch.Tensor,
|
187 |
+
sin: torch.Tensor,
|
188 |
+
mask: Optional[torch.Tensor] = None,
|
189 |
+
input_pos: Optional[torch.Tensor] = None,
|
190 |
+
) -> torch.Tensor:
|
191 |
+
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
192 |
+
|
193 |
+
qkv = self.attn(x)
|
194 |
+
|
195 |
+
# assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`)
|
196 |
+
q_per_kv = self.config.n_head // self.config.n_query_groups
|
197 |
+
total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value
|
198 |
+
qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size)
|
199 |
+
qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs)
|
200 |
+
|
201 |
+
# split batched computation into three
|
202 |
+
q, k, v = qkv.split((q_per_kv, 1, 1), dim=2)
|
203 |
+
|
204 |
+
# maybe repeat k and v for the non multi-head attention cases
|
205 |
+
# training: flash attention requires it
|
206 |
+
# inference: multi-query would require a full kv cache so avoid it to limit its memory usage
|
207 |
+
if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1):
|
208 |
+
k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
|
209 |
+
v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
|
210 |
+
|
211 |
+
q = q.reshape(B, -1, T, self.config.head_size) # (B, nh_q, T, hs)
|
212 |
+
k = k.reshape(B, -1, T, self.config.head_size) # (B, nh_k, T, hs)
|
213 |
+
v = v.reshape(B, -1, T, self.config.head_size) # (B, nh_v, T, hs)
|
214 |
+
|
215 |
+
q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin)
|
216 |
+
k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin)
|
217 |
+
q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1)
|
218 |
+
k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1)
|
219 |
+
|
220 |
+
if input_pos is not None:
|
221 |
+
if not isinstance(self.kv_cache, KVCache):
|
222 |
+
raise TypeError("You need to call `gpt.set_kv_cache()`")
|
223 |
+
k, v = self.kv_cache(input_pos, k, v)
|
224 |
+
|
225 |
+
y = self.scaled_dot_product_attention(q, k, v, mask)
|
226 |
+
|
227 |
+
y = y.reshape(B, T, C) # re-assemble all head outputs side by side
|
228 |
+
|
229 |
+
# output projection
|
230 |
+
return self.proj(y)
|
231 |
+
|
232 |
+
def scaled_dot_product_attention(
|
233 |
+
self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
|
234 |
+
) -> torch.Tensor:
|
235 |
+
scale = 1.0 / math.sqrt(self.config.head_size)
|
236 |
+
y = torch.nn.functional.scaled_dot_product_attention(
|
237 |
+
q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None
|
238 |
+
)
|
239 |
+
return y.transpose(1, 2)
|
240 |
+
|
241 |
+
def build_kv_cache(
|
242 |
+
self,
|
243 |
+
batch_size: int,
|
244 |
+
max_seq_length: int,
|
        rope_cache_length: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> "KVCache":
        heads = 1 if self.config.n_query_groups == 1 else self.config.n_head
        v_shape = (batch_size, heads, max_seq_length, self.config.head_size)
        if rope_cache_length is None:
            if self.config.rotary_percentage != 1.0:
                raise TypeError("Please pass the `rope_cache_length=gpt.cos.size(-1)` value")
            k_shape = v_shape
        else:
            k_shape = (
                batch_size,
                heads,
                max_seq_length,
                rope_cache_length + self.config.head_size - self.config.rope_n_elem,
            )
        return KVCache(k_shape, v_shape, device=device, dtype=dtype)


class GptNeoxMLP(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)

        self.config = config

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc(x)
        x = torch.nn.functional.gelu(x, approximate=self.config.gelu_approximate)
        return self.proj(x)


class LLaMAMLP(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_fc_1 = self.fc_1(x)
        x_fc_2 = self.fc_2(x)
        x = torch.nn.functional.silu(x_fc_1) * x_fc_2
        return self.proj(x)


def build_rope_cache(
    seq_len: int, n_elem: int, device: Optional[torch.device] = None, base: int = 10000, condense_ratio: int = 1
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Enhanced Transformer with Rotary Position Embedding.

    Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
    transformers/rope/__init__.py. MIT License:
    https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
    """
    # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))

    # Create position indexes `[0, 1, ..., seq_len - 1]`
    seq_idx = torch.arange(seq_len, device=device) / condense_ratio

    # Calculate the product of position index and $\theta_i$
    idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)

    return torch.cos(idx_theta), torch.sin(idx_theta)


def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    head_size = x.size(-1)
    x1 = x[..., : head_size // 2]  # (B, nh, T, hs/2)
    x2 = x[..., head_size // 2 :]  # (B, nh, T, hs/2)
    rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
    roped = (x * cos) + (rotated * sin)
    return roped.type_as(x)


class KVCache(nn.Module):
    def __init__(
        self,
        k_shape: Tuple[int, int, int, int],
        v_shape: Tuple[int, int, int, int],
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        super().__init__()
        self.register_buffer("k", torch.zeros(k_shape, device=device, dtype=dtype), persistent=False)
        self.register_buffer("v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False)

    def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # move the buffer to the activation dtype for when AMP is used
        self.k = self.k.to(k.dtype)
        self.v = self.v.to(v.dtype)
        # update the cache
        k = self.k.index_copy_(2, input_pos, k)
        v = self.v.index_copy_(2, input_pos, v)
        return k, v
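A minimal sketch of how these pieces fit together during decoding: build the cos/sin cache once, rotate the query/key heads with it, and write the rotated keys/values into the cache at the given positions. The batch/head/sequence sizes below are illustrative assumptions, not values from this commit.

    import torch

    # cos/sin cache for up to 512 positions with 64 rotary dims each
    cos, sin = build_rope_cache(seq_len=512, n_elem=64)        # (512, 64) each

    q = torch.randn(1, 8, 512, 64)                             # (B, nh, T, hs)
    k = torch.randn(1, 8, 512, 64)
    v = torch.randn(1, 8, 512, 64)
    q_roped = apply_rope(q, cos, sin)
    k_roped = apply_rope(k, cos, sin)

    # cache sized for the same batch/heads/positions; fill positions 0..511
    cache = KVCache(k_shape=(1, 8, 512, 64), v_shape=(1, 8, 512, 64))
    k_all, v_all = cache(torch.arange(512), k_roped, v)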
tsai_gpt/packed_dataset.py
ADDED
@@ -0,0 +1,235 @@
# Very loosely inspired by indexed_dataset in Fairseq, Megatron
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py


import os
import random
import struct

import numpy as np
import torch
from torch.utils.data import IterableDataset, get_worker_info

dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float32, 7: np.float64, 8: np.uint16}


def code(dtype):
    for k in dtypes:
        if dtypes[k] == dtype:
            return k
    raise ValueError(dtype)


HDR_MAGIC = b"LITPKDS"
HDR_SIZE = 24  # bytes


class PackedDataset(IterableDataset):
    def __init__(
        self, filenames, n_chunks, block_size, seed=12345, shuffle=True, wrap=False, num_processes=1, process_rank=0
    ):
        self._filenames = filenames
        self._n_chunks = n_chunks
        self._block_size = block_size
        self._seed = seed
        self._shuffle = shuffle
        self._wrap = wrap
        self._num_processes = num_processes
        self._process_rank = process_rank

    def __iter__(self):
        worker_info = get_worker_info()
        num_workers = worker_info.num_workers if worker_info is not None else 1
        worker_id = worker_info.id if worker_info is not None else 0
        num_shards = num_workers * self._num_processes
        shard_id = self._process_rank * num_workers + worker_id

        max_num_files = len(self._filenames) // num_shards * num_shards
        filenames = self._filenames[shard_id:max_num_files:num_shards]

        return PackedDatasetIterator(
            filenames=filenames,
            n_chunks=self._n_chunks,
            block_size=self._block_size,
            seed=self._seed,
            shuffle=self._shuffle,
            wrap=self._wrap,
        )


class PackedDatasetBuilder(object):
    def __init__(self, outdir, prefix, chunk_size, sep_token, dtype="auto", vocab_size=None):
        if dtype == "auto":
            if vocab_size is None:
                raise ValueError("vocab_size cannot be None when dtype='auto'")
            if vocab_size is not None and vocab_size < 65500:
                self._dtype = np.uint16
            else:
                self._dtype = np.int32
        else:
            self._dtype = dtype
        self._counter = 0
        self._chunk_size = chunk_size
        self._outdir = outdir
        self._prefix = prefix
        self._sep_token = sep_token
        self._arr = np.zeros(self._chunk_size, dtype=self._dtype)
        self._arr.fill(self._sep_token)
        self._idx = 0
        self._version = 1
        self._filenames = []

    def _write_chunk(self):
        filename = f"{self._prefix}_{self._counter:010d}.bin"
        filename = os.path.join(self._outdir, filename)

        with open(filename, "wb") as f:
            f.write(HDR_MAGIC)
            f.write(struct.pack("<Q", self._version))
            f.write(struct.pack("<B", code(self._dtype)))
            f.write(struct.pack("<Q", self._chunk_size))
            f.write(self._arr.tobytes(order="C"))

        self._filenames.append(filename)
        self._counter += 1
        self._arr.fill(self._sep_token)
        self._idx = 0

    @property
    def dtype(self):
        return self._dtype

    @property
    def filenames(self):
        return self._filenames.copy()

    def add_array(self, arr):
        while self._idx + arr.shape[0] > self._chunk_size:
            part_len = self._chunk_size - self._idx
            self._arr[self._idx : self._idx + part_len] = arr[:part_len]
            self._write_chunk()
            arr = arr[part_len:]

        arr_len = arr.shape[0]
        self._arr[self._idx : self._idx + arr_len] = arr
        self._idx += arr_len

    def write_reminder(self):
        self._write_chunk()


class PackedDatasetIterator:
    def __init__(self, filenames, n_chunks, block_size, seed, shuffle, wrap):
        self._seed = seed
        self._shuffle = shuffle
        self._rng = np.random.default_rng(seed) if shuffle else None
        self._block_idxs = None

        self._wrap = wrap

        # TODO: instead of filenames, we could have a single text stream
        # (or text file) with the sequence of all files to be
        # fetched/loaded.
        self._filenames = filenames
        self._file_idx = 0

        self._n_chunks = n_chunks

        self._dtype = None
        self._block_size = block_size
        self._n_blocks = None

        self._mmaps = []
        self._buffers = []

        self._block_idxs = []
        self._curr_idx = 0

        self._load_n_chunks()

    def _read_header(self, path):
        with open(path, "rb") as f:
            magic = f.read(len(HDR_MAGIC))
            assert magic == HDR_MAGIC, "File doesn't match expected format."
            version = struct.unpack("<Q", f.read(8))
            assert version == (1,)
            (dtype_code,) = struct.unpack("<B", f.read(1))
            dtype = dtypes[dtype_code]
            (chunk_size,) = struct.unpack("<Q", f.read(8))
        return dtype, chunk_size

    def _close_mmaps(self):
        for mmap in self._mmaps:
            mmap._mmap.close()

    def _load_n_chunks(self):
        self._close_mmaps()
        self._mmaps = []
        self._buffers = []

        if self._n_chunks > len(self._filenames[self._file_idx :]):
            if not self._wrap:
                raise StopIteration
            self._file_idx = 0

        for i in range(self._n_chunks):
            filename = self._filenames[self._file_idx + i]
            if self._dtype is None:
                self._dtype, self._chunk_size = self._read_header(filename)
                self._n_blocks = self._chunk_size // self._block_size
            # TODO: check header matches with previous files
            mmap = np.memmap(filename, mode="r", order="C", offset=HDR_SIZE)
            self._mmaps.append(mmap)
            self._buffers.append(memoryview(mmap))

        self._file_idx += self._n_chunks
        n_all_blocks = self._n_chunks * self._n_blocks

        self._block_idxs = self._rng.permutation(n_all_blocks) if self._shuffle else range(n_all_blocks)

        self._curr_idx = 0

    def __del__(self):
        self._close_mmaps()
        del self._mmaps
        del self._buffers

    def __iter__(self):
        return self

    def __next__(self):
        if self._curr_idx >= len(self._block_idxs):
            self._load_n_chunks()
            # TODO: trigger fetching next next n_chunks if remote
        block_idx = self._block_idxs[self._curr_idx]
        chunk_id = block_idx // self._n_blocks
        buffer = self._buffers[chunk_id]
        elem_id = (block_idx % self._n_blocks) * self._block_size
        offset = np.dtype(self._dtype).itemsize * elem_id
        arr = np.frombuffer(buffer, dtype=self._dtype, count=self._block_size, offset=offset)
        self._curr_idx += 1
        return torch.from_numpy(arr.astype(np.int64))


class CombinedDataset(IterableDataset):
    def __init__(self, datasets, seed, weights=None):
        self._seed = seed
        self._datasets = datasets
        self._weights = weights
        n_datasets = len(datasets)
        if weights is None:
            self._weights = [1 / n_datasets] * n_datasets

    def __iter__(self):
        return CombinedDatasetIterator(self._datasets, self._seed, self._weights)


class CombinedDatasetIterator:
    def __init__(self, datasets, seed, weights):
        self._datasets = [iter(el) for el in datasets]
        self._weights = weights
        self._rng = random.Random(seed)

    def __next__(self):
        (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1)
        return next(dataset)
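A rough sketch of the intended round trip: tokenized arrays are appended into packed ``.bin`` chunks by the builder, and fixed-size blocks are streamed back out by the dataset. The output directory, chunk size and block size below are made-up example values, not the ones used for the RedPajama run.

    import numpy as np

    builder = PackedDatasetBuilder(
        outdir="data", prefix="train", chunk_size=2048 * 16, sep_token=0, vocab_size=50254
    )
    builder.add_array(np.array([1, 2, 3, 4], dtype=np.uint16))  # append one tokenized document
    builder.write_reminder()                                    # flush the partially filled last chunk

    dataset = PackedDataset(builder.filenames, n_chunks=1, block_size=2048, shuffle=True)
    block = next(iter(dataset))                                 # torch.int64 tensor of length 2048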
tsai_gpt/rmsnorm.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
class RMSNorm(torch.nn.Module):
|
5 |
+
"""Root Mean Square Layer Normalization.
|
6 |
+
|
7 |
+
Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
|
8 |
+
https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
|
9 |
+
"""
|
10 |
+
|
11 |
+
def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
|
12 |
+
super().__init__()
|
13 |
+
self.weight = torch.nn.Parameter(torch.ones(size))
|
14 |
+
self.eps = eps
|
15 |
+
self.dim = dim
|
16 |
+
|
17 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
18 |
+
dtype = x.dtype
|
19 |
+
x = x.float()
|
20 |
+
# NOTE: the original RMSNorm paper implementation is not equivalent
|
21 |
+
norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
|
22 |
+
x_normed = x * torch.rsqrt(norm_x + self.eps)
|
23 |
+
return (self.weight * x_normed).to(dtype=dtype)
|
24 |
+
|
25 |
+
def reset_parameters(self) -> None:
|
26 |
+
torch.nn.init.ones_(self.weight)
|
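In other words, each activation vector is rescaled by its root mean square rather than being mean-centred as in LayerNorm. A quick check of the forward pass against the formula (sizes are arbitrary):

    import torch

    norm = RMSNorm(size=4)
    x = torch.randn(2, 4)
    y = norm(x)
    # equivalent to weight * x / sqrt(mean(x**2) + eps)
    manual = norm.weight * x * torch.rsqrt((x * x).mean(dim=-1, keepdim=True) + norm.eps)
    assert torch.allclose(y, manual)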
tsai_gpt/speed_monitor.py
ADDED
@@ -0,0 +1,425 @@
import time
from collections import deque
from contextlib import nullcontext
from typing import Any, Callable, Deque, Dict, Optional

import torch
from lightning import Callback, Fabric, LightningModule, Trainer
from lightning.fabric.accelerators.xla import _XLA_GREATER_EQUAL_2_1
from lightning.fabric.plugins import (
    BitsandbytesPrecision,
    DoublePrecision,
    FSDPPrecision,
    HalfPrecision,
    MixedPrecision,
    Precision,
    TransformerEnginePrecision,
    XLAPrecision,
)
from lightning.fabric.utilities.rank_zero import rank_zero_only as fabric_rank_zero_only
from lightning.pytorch.plugins import (
    DoublePrecisionPlugin,
    FSDPPrecisionPlugin,
    HalfPrecisionPlugin,
    MixedPrecisionPlugin,
    XLAPrecisionPlugin,
)
from lightning.pytorch.utilities.rank_zero import rank_zero_only as trainer_rank_zero_only
from torch.utils.flop_counter import FlopCounterMode

from tsai_gpt import GPT
from tsai_gpt.utils import num_parameters

GPU_AVAILABLE_FLOPS = {
    # source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
    # nvidia publishes spec sheet with a 2x sparsity factor
    "h100-sxm": {
        torch.float64: 67e12,
        torch.float32: 67e12,
        torch.bfloat16: 1.979e15 / 2,
        torch.float16: 1.979e15 / 2,
        torch.int8: 3.958e15 / 2,
    },
    "h100-pcie": {
        torch.float64: 51e12,
        torch.float32: 51e12,
        torch.bfloat16: 1.513e15 / 2,
        torch.float16: 1.513e15 / 2,
        torch.int8: 3.026e15 / 2,
    },
    # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
    # sxm and pcie have same flop counts
    "a100": {torch.float64: 19.5e12, torch.float32: 19.5e12, torch.bfloat16: 312e12, torch.float16: 312e12},
    # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf
    "a10g": {torch.float32: 31.2e12, torch.bfloat16: 125e12, torch.float16: 125e12},
    # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf
    "v100-sxm": {torch.float64: 7.8e12, torch.float32: 15.7e12, torch.float16: 125e12},
    "v100-pcie": {torch.float64: 7e12, torch.float32: 14e12, torch.float16: 112e12},
    "v100s-pcie": {torch.float64: 8.2e12, torch.float32: 16.4e12, torch.float16: 130e12},
    # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf
    # sxm and pcie have same flop counts
    "t4": {torch.float32: 8.1e12, torch.float16: 65e12, torch.int8: 130e12},
    # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf
    "quadro rtx 5000": {torch.float32: 11.2e12, torch.float16: 89.2e12},
}

TPU_AVAILABLE_FLOPS = {
    # flop count for each TPU generation is the same for all precisions
    # since bfloat16 precision is always used for performing matrix operations
    # for more info: https://cloud.google.com/tpu/docs/bfloat16#choosing_bfloat16
    # source: https://arxiv.org/pdf/1907.10701.pdf
    "v2": 45e12,
    # source: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v3
    "v3": 123e12,
    # source: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v4
    "v4": 275e12,
    # source: https://cloud.google.com/tpu/docs/v5e-training
    "v5litepod": 197e12,
}


def get_flops_available(device: torch.device, dtype: torch.dtype) -> Optional[float]:
    if device.type == "cuda":
        device_name = torch.cuda.get_device_name(device).lower()
        if "h100" in device_name and "hbm3" in device_name:
            device_name = "h100-sxm"
        elif "h100" in device_name and ("pcie" in device_name or "hbm2e" in device_name):
            device_name = "h100-pcie"
        elif "a100" in device_name:
            device_name = "a100"
        elif "a10g" in device_name:
            device_name = "a10g"
        elif "v100-sxm" in device_name:
            device_name = "v100-sxm"
        elif "v100-pcie" in device_name:
            device_name = "v100-pcie"
        elif "t4" in device_name:
            device_name = "t4"
        elif "quadro rtx 5000" in device_name:
            device_name = "quadro rtx 5000"
        else:
            device_name = None

        if device_name is not None:
            try:
                return int(GPU_AVAILABLE_FLOPS[device_name][dtype])
            except KeyError:
                raise KeyError(
                    f"flop count not found for {device_name} with dtype: {dtype}; "
                    "MFU cannot be calculated and reported."
                )
    elif device.type == "xla":
        if _XLA_GREATER_EQUAL_2_1:
            from torch_xla._internal import tpu
        else:
            from torch_xla.experimental import tpu

        device_name = tpu.get_tpu_env()["TYPE"].lower()
        try:
            return int(TPU_AVAILABLE_FLOPS[device_name])
        except KeyError:
            raise KeyError(
                f"flop count not found for {device_name} with dtype: {dtype}; MFU cannot be calculated and reported."
            )

    return None


# Adapted from https://github.com/mosaicml/composer/blob/f2a2dc820cb75023b9eb7c46fdfd25273712abd0/composer/callbacks/speed_monitor.py


class SpeedMonitorBase:
    """Logs the training throughput and utilization.

    +-------------------------------------+-----------------------------------------------------------+
    | Key                                 | Logged data                                               |
    +=====================================+===========================================================+
    | `throughput/batches_per_sec`        | Rolling average (over `window_size` most recent batches) |
    |                                     | of the number of batches processed per second            |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/samples_per_sec`        | Rolling average (over `window_size` most recent batches) |
    |                                     | of the number of samples processed per second            |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/tokens_per_sec`         | Rolling average (over `window_size` most recent batches) |
    |                                     | of the number of tokens processed per second.            |
    |                                     | This may include padding depending on dataset            |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/flops_per_sec`          | Estimates flops by `flops_per_batch * batches_per_sec`   |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/device/batches_per_sec` | `throughput/batches_per_sec` divided by world size       |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/device/samples_per_sec` | `throughput/samples_per_sec` divided by world size       |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/device/tokens_per_sec`  | `throughput/tokens_per_sec` divided by world size. This  |
    |                                     | may include pad tokens depending on dataset              |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/device/flops_per_sec`   | `throughput/flops_per_sec` divided by world size. Only   |
    |                                     | logged when model has attribute `flops_per_batch`        |
    +-------------------------------------+-----------------------------------------------------------+
    | `throughput/device/mfu`             | `throughput/device/flops_per_sec` divided by world size. |
    +-------------------------------------+-----------------------------------------------------------+
    | `time/train`                        | Total elapsed training time                               |
    +-------------------------------------+-----------------------------------------------------------+
    | `time/val`                          | Total elapsed validation time                             |
    +-------------------------------------+-----------------------------------------------------------+
    | `time/total`                        | Total elapsed time (time/train + time/val)                |
    +-------------------------------------+-----------------------------------------------------------+

    Notes:
        - The implementation assumes that devices are homogeneous as it normalizes by the world size.
        - Tokens/sec, flops/sec and MFU do not account for padding tokens if present. We suggest using samples/sec or
          batches/sec to measure throughput under this circumstance.
        - Be careful when comparing MFU numbers across projects, as this will highly depend on the ``flops_per_batch``.
          There is no widespread, realistic, and reliable implementation to compute them.
          We suggest using our ``measure_flops`` function, but many other works will use ``estimated_flops`` which
          will almost always be an overestimate when compared to the true value.

    Args:
        window_size (int, optional): Number of batches to use for a rolling average of throughput.
            Defaults to 100.
        time_unit (str, optional): Time unit to use for `time` logging. Can be one of
            'seconds', 'minutes', 'hours', or 'days'. Defaults to 'hours'.
    """

    def __init__(
        self,
        flops_available: float,
        log_dict: Callable[[Dict, int], None],
        window_size: int = 100,
        time_unit: str = "hours",
    ):
        self.flops_available = flops_available
        self.log_dict = log_dict

        # Track the batch num samples and wct to compute throughput over a window of batches
        self.history_samples: Deque[int] = deque(maxlen=window_size + 1)
        self.history_wct: Deque[float] = deque(maxlen=window_size + 1)
        self.history_lengths: Deque[int] = deque(maxlen=window_size + 1)
        self.history_flops: Deque[int] = deque(maxlen=window_size + 1)

        self.divider = 1
        if time_unit == "seconds":
            self.divider = 1
        elif time_unit == "minutes":
            self.divider = 60
        elif time_unit == "hours":
            self.divider = 60 * 60
        elif time_unit == "days":
            self.divider = 60 * 60 * 24
        else:
            raise ValueError(
                f'Invalid time_unit: {time_unit}. Must be one of "seconds", "minutes", "hours", or "days".'
            )

        # Keep track of time spent evaluating
        self.total_eval_wct = 0.0
        self.step = -1

    def on_train_batch_end(
        self,
        samples: int,  # total samples seen (per device)
        train_elapsed: float,  # total training time (seconds)
        world_size: int,
        flops_per_batch: Optional[int] = None,  # (per device)
        lengths: Optional[int] = None,  # total length of the samples seen (per device)
    ) -> None:
        self.step += 1
        step = self.step
        metrics = {}

        self.history_samples.append(samples)
        if lengths is not None:
            self.history_lengths.append(lengths)
            # if lengths are passed, there should be as many values as samples
            assert len(self.history_samples) == len(self.history_lengths)
        self.history_wct.append(train_elapsed)
        if len(self.history_wct) == self.history_wct.maxlen:
            elapsed_batches = len(self.history_samples) - 1
            elapsed_samples = self.history_samples[-1] - self.history_samples[0]
            elapsed_wct = self.history_wct[-1] - self.history_wct[0]
            samples_per_sec = elapsed_samples * world_size / elapsed_wct
            dev_samples_per_sec = elapsed_samples / elapsed_wct
            metrics.update(
                {
                    "throughput/batches_per_sec": elapsed_batches * world_size / elapsed_wct,
                    "throughput/samples_per_sec": samples_per_sec,
                    "throughput/device/batches_per_sec": elapsed_batches / elapsed_wct,
                    "throughput/device/samples_per_sec": dev_samples_per_sec,
                }
            )
            if lengths is not None:
                elapsed_lengths = int(self.history_lengths[-1]) - int(self.history_lengths[0])
                avg_length = elapsed_lengths / elapsed_batches
                metrics.update(
                    {
                        "throughput/tokens_per_sec": samples_per_sec * avg_length,
                        "throughput/device/tokens_per_sec": dev_samples_per_sec * avg_length,
                    }
                )

        if flops_per_batch is not None:
            # sum of flops per batch across ranks
            self.history_flops.append(flops_per_batch * world_size)
            if len(self.history_flops) == self.history_flops.maxlen:
                elapsed_flops = sum(self.history_flops) - self.history_flops[0]
                elapsed_wct = self.history_wct[-1] - self.history_wct[0]
                flops_per_sec = elapsed_flops / elapsed_wct
                device_flops_per_sec = flops_per_sec / world_size
                metrics.update(
                    {"throughput/flops_per_sec": flops_per_sec, "throughput/device/flops_per_sec": device_flops_per_sec}
                )
                if self.flops_available:
                    metrics["throughput/device/mfu"] = device_flops_per_sec / self.flops_available

        metrics.update(
            {
                "time/train": train_elapsed / self.divider,
                "time/val": self.total_eval_wct / self.divider,
                "time/total": (train_elapsed + self.total_eval_wct) / self.divider,
                "samples": samples,
            }
        )

        self.log_dict(metrics, step)

    def eval_end(self, eval_elapsed: float) -> None:
        self.total_eval_wct += eval_elapsed  # seconds


def plugin_to_compute_dtype(plugin: Precision) -> torch.dtype:
    if isinstance(plugin, BitsandbytesPrecision):
        return plugin.dtype
    if isinstance(plugin, (HalfPrecision, MixedPrecision, HalfPrecisionPlugin)):
        return plugin._desired_input_dtype
    if isinstance(plugin, MixedPrecisionPlugin):
        return torch.bfloat16 if plugin.precision == "bf16-mixed" else torch.half
    if isinstance(plugin, (DoublePrecision, DoublePrecisionPlugin)):
        return torch.double
    if isinstance(plugin, (XLAPrecision, XLAPrecisionPlugin)):
        return plugin._desired_dtype
    if isinstance(plugin, TransformerEnginePrecision):
        return torch.int8
    if isinstance(plugin, (FSDPPrecision, FSDPPrecisionPlugin)):
        return plugin.mixed_precision_config.reduce_dtype
    if isinstance(plugin, Precision):
        return torch.float32
    raise NotImplementedError(plugin)


class SpeedMonitorFabric(SpeedMonitorBase):
    def __init__(self, fabric: Fabric, *args: Any, **kwargs: Any) -> None:
        dtype = plugin_to_compute_dtype(fabric.strategy.precision)
        flops_available = get_flops_available(fabric.device, dtype)
        super().__init__(flops_available, fabric.log_dict, *args, **kwargs)

    @fabric_rank_zero_only
    def on_train_batch_end(self, *args: Any, **kwargs: Any) -> None:
        super().on_train_batch_end(*args, **kwargs)


class SpeedMonitorCallback(Callback):
    def __init__(self, length_fn: Callable[[Any], int], batch_size: int, **kwargs: Any) -> None:
        super().__init__()
        self.speed_monitor: Optional[SpeedMonitorBase] = None
        self.speed_monitor_kwargs = kwargs
        self.length_fn = length_fn
        self.batch_size = batch_size
        self.eval_t0: int = 0
        self.train_t0: int = 0
        self.total_lengths: int = 0

    def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None:
        if self.speed_monitor is not None:
            return  # already setup
        dtype = plugin_to_compute_dtype(trainer.precision_plugin)
        flops_available = get_flops_available(trainer.strategy.root_device, dtype)
        self.speed_monitor = SpeedMonitorBase(flops_available, trainer.logger.log_metrics, **self.speed_monitor_kwargs)

    @trainer_rank_zero_only
    def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
        if trainer.fit_loop._should_accumulate():
            return

        self.train_t0 = time.perf_counter()

    @trainer_rank_zero_only
    def on_train_batch_end(
        self, trainer: Trainer, pl_module: LightningModule, outputs: Any, batch: Any, batch_idx: int
    ) -> None:
        self.total_lengths += self.length_fn(batch)
        if trainer.fit_loop._should_accumulate():
            return
        train_elapsed = time.perf_counter() - self.train_t0
        assert self.speed_monitor is not None
        iter_num = trainer.fit_loop.total_batch_idx
        assert (measured_flops := pl_module.measured_flops) is not None
        self.speed_monitor.on_train_batch_end(
            (iter_num + 1) * self.batch_size,
            train_elapsed,
            # this assumes that device FLOPs are the same and that all devices have the same batch size
            trainer.world_size,
            flops_per_batch=measured_flops,
            lengths=self.total_lengths,
        )

    @trainer_rank_zero_only
    def on_validation_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
        self.eval_t0 = time.perf_counter()

    @trainer_rank_zero_only
    def on_validation_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
        eval_elapsed = time.perf_counter() - self.eval_t0
        assert self.speed_monitor is not None
        self.speed_monitor.eval_end(eval_elapsed)


def flops_per_param(max_seq_length: int, n_layer: int, n_embd: int, n_params: int) -> int:
    flops_per_token = 2 * n_params  # each parameter is used for a MAC (2 FLOPS) per network operation
    # this assumes that all samples have a fixed length equal to the block size
    # which is most likely false during finetuning
    flops_per_seq = flops_per_token * max_seq_length
    attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2))
    return flops_per_seq + attn_flops_per_seq


def estimate_flops(model: GPT) -> int:
    """Measures estimated FLOPs for MFU.

    Refs:
        * https://ar5iv.labs.arxiv.org/html/2205.05198#A1
        * https://ar5iv.labs.arxiv.org/html/2204.02311#A2
    """
    # using all parameters for this is a naive over estimation because not all model parameters actually contribute to
    # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage
    # (~10%) compared to the measured FLOPs, making those lower but more realistic.
    # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper.
    n_trainable_params = num_parameters(model, requires_grad=True)
    trainable_flops = flops_per_param(
        model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params
    )
    # forward + backward + gradients (assumes no gradient accumulation)
    ops_per_step = 3 if model.training else 1
    n_frozen_params = num_parameters(model, requires_grad=False)
    frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params)
    # forward + backward
    frozen_ops_per_step = 2 if model.training else 1
    return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops


def measure_flops(model: GPT, x: torch.Tensor) -> int:
    """Measures real FLOPs for HFU"""
    flop_counter = FlopCounterMode(model, display=False)
    ctx = nullcontext() if model.training else torch.no_grad()
    with ctx, flop_counter:
        y = model(x)
        if model.training:
            y.sum().backward()
    return flop_counter.get_total_flops()
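A hedged sketch of how the Fabric flavour would typically be wired into a training loop; the Fabric setup, batch/block sizes, and the flops-per-batch number below are illustrative placeholders, not values taken from this commit.

    import time
    import lightning as L

    fabric = L.Fabric(devices=1, accelerator="cpu")            # illustrative setup
    monitor = SpeedMonitorFabric(fabric, window_size=50, time_unit="seconds")

    micro_batch_size, block_size, flops_per_batch = 4, 2048, 1e12   # placeholder numbers
    train_t0 = time.perf_counter()
    for iter_num in range(100):
        ...  # forward / backward / optimizer step would go here
        monitor.on_train_batch_end(
            (iter_num + 1) * micro_batch_size,        # total samples seen on this rank
            time.perf_counter() - train_t0,           # elapsed training seconds
            fabric.world_size,
            flops_per_batch=flops_per_batch,          # e.g. estimate_flops(model) * micro_batch_size
            lengths=(iter_num + 1) * block_size,      # total tokens seen on this rank
        )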
tsai_gpt/tokenizer.py
ADDED
@@ -0,0 +1,103 @@
import json
from pathlib import Path
from typing import Optional

import torch


class Tokenizer:
    def __init__(self, checkpoint_dir: Path) -> None:
        self.use_bos = self.check_if_bos_token_used(checkpoint_dir)
        self.bos_id = None
        self.eos_id = None

        # some checkpoints have both files, `.model` takes precedence
        if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file():
            from sentencepiece import SentencePieceProcessor

            self.processor = SentencePieceProcessor(model_file=str(vocabulary_path))
            self.backend = "sentencepiece"
            self.bos_id = self.processor.bos_id()
            self.eos_id = self.processor.eos_id()

        elif (vocabulary_path := checkpoint_dir / "tokenizer.json").is_file():
            from tokenizers import Tokenizer as HFTokenizer

            self.processor = HFTokenizer.from_file(str(vocabulary_path))
            self.backend = "huggingface"

            if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
                with open(special_tokens_path) as fp:
                    config = json.load(fp)
                bos_token = config.get("bos_token")
                self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
                eos_token = config.get("eos_token")
                self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
            if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
                with open(special_tokens_path) as fp:
                    config = json.load(fp)
                if self.bos_id is None:
                    self.bos_id = config.get("bos_token_id")
                if self.eos_id is None:
                    self.eos_id = config.get("eos_token_id")
        else:
            raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        if self.backend == "huggingface":
            return self.processor.get_vocab_size(with_added_tokens=False)
        if self.backend == "sentencepiece":
            return self.processor.vocab_size()
        raise RuntimeError

    def token_to_id(self, token: str) -> int:
        if self.backend == "huggingface":
            id_ = self.processor.token_to_id(token)
        elif self.backend == "sentencepiece":
            id_ = self.processor.piece_to_id(token)
        else:
            raise RuntimeError
        if id_ is None:
            raise ValueError(f"token {token!r} not found in the collection.")
        return id_

    def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
        if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
            return False
        with open(tokenizer_config_path) as fp:
            config = json.load(fp)
        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
            return True
        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
        # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"

    def encode(
        self,
        string: str,
        device: Optional[torch.device] = None,
        bos: Optional[bool] = None,
        eos: bool = False,
        max_length: int = -1,
    ) -> torch.Tensor:
        if self.backend == "huggingface":
            tokens = self.processor.encode(string).ids
        elif self.backend == "sentencepiece":
            tokens = self.processor.encode(string)
        else:
            raise RuntimeError
        if bos or (bos is None and self.use_bos):
            bos_id = self.bos_id
            if bos_id is None:
                raise NotImplementedError("This tokenizer does not have a defined a bos token")
            tokens = [bos_id] + tokens
        if eos:
            tokens = tokens + [self.eos_id]
        if max_length > 0:
            tokens = tokens[:max_length]
        return torch.tensor(tokens, dtype=torch.int, device=device)

    def decode(self, tensor: torch.Tensor) -> str:
        tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist()
        return self.processor.decode(tokens)
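A small usage sketch: point the class at any checkpoint directory that holds a tokenizer.model (SentencePiece) or tokenizer.json (Hugging Face tokenizers) file and the matching tokenizer_config.json. The directory path below is a hypothetical example, not a path from this repository.

    from pathlib import Path
    import torch

    tokenizer = Tokenizer(Path("checkpoints/some-llama-checkpoint"))  # hypothetical path
    ids = tokenizer.encode("Hello world", bos=True, device=torch.device("cpu"))
    text = tokenizer.decode(ids)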
tsai_gpt/utils.py
ADDED
@@ -0,0 +1,399 @@
"""Utility functions for training and inference."""
import math
import pickle
import sys
from contextlib import nullcontext
from io import BytesIO
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    ContextManager,
    Dict,
    List,
    Mapping,
    Optional,
    TypeVar,
    Union,
)

import lightning as L
import torch
import torch.nn as nn
import torch.utils._device
from lightning.fabric.strategies import FSDPStrategy
from lightning.fabric.utilities.load import _lazy_load as lazy_load
from torch.serialization import normalize_storage_type

if TYPE_CHECKING:
    from model import GPT


def find_multiple(n: int, k: int) -> int:
    assert k > 0
    if n % k == 0:
        return n
    return n + k - (n % k)


def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> int:
    total = 0
    for p in module.parameters():
        if requires_grad is None or p.requires_grad == requires_grad:
            if hasattr(p, "quant_state"):
                # bitsandbytes 4bit layer support
                total += math.prod(p.quant_state[1])
            else:
                total += p.numel()
    return total


def gptq_quantization(enabled: bool = False) -> ContextManager:
    if not enabled:
        return nullcontext()

    from lightning.fabric.plugins.precision.utils import _ClassReplacementContextManager

    from quantize.gptq import ColBlockQuantizedLinear

    class QuantizedLinear(ColBlockQuantizedLinear):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, bits=4, tile_cols=-1, **kwargs)

    return _ClassReplacementContextManager({"torch.nn.Linear": QuantizedLinear})


def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None:
    files = {
        "lit_model.pth": (checkpoint_dir / "lit_model.pth").is_file(),
        "lit_config.json": (checkpoint_dir / "lit_config.json").is_file(),
        "tokenizer.json OR tokenizer.model": (
            checkpoint_dir / "tokenizer.json"
        ).is_file()
        or (checkpoint_dir / "tokenizer.model").is_file(),
        "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(),
    }
    if checkpoint_dir.is_dir():
        if all(files.values()):
            # we're good
            return
        problem = f" is missing the files: {[f for f, exists in files.items() if not exists]!r}"
    else:
        problem = " is not a checkpoint directory"

    # list locally available checkpoints
    available = list(Path("checkpoints").glob("*/*"))
    if available:
        options = "\n --checkpoint_dir ".join(
            [""] + [repr(str(p.resolve())) for p in available]
        )
        extra = f"\nYou have downloaded locally:{options}\n"
    else:
        extra = ""

    error_message = (
        f"--checkpoint_dir {str(checkpoint_dir.absolute())!r}{problem}."
        "\nFind download instructions at https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials\n"
        f"{extra}\nSee all download options by running:\n python scripts/download.py"
    )
    print(error_message, file=sys.stderr)
    raise SystemExit(1)


class SavingProxyForStorage:
    def __init__(self, obj, saver, protocol_version=5):
        self.protocol_version = protocol_version
        self.saver = saver
        if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)):
            raise TypeError(f"expected storage, not {type(obj)}")

        # this logic is taken from PyTorch 2.0+ torch/serialization.py
        if isinstance(obj, torch.storage.TypedStorage):
            # PT upstream wants to deprecate this eventually...
            storage = obj._untyped_storage
            storage_type_str = obj._pickle_storage_type()
            storage_type = getattr(torch, storage_type_str)
            storage_numel = obj._size()
        else:
            storage = obj
            storage_type = normalize_storage_type(type(obj))
            storage_numel = storage.nbytes()

        storage_key = saver._write_storage_and_return_key(storage)
        location = torch.serialization.location_tag(storage)

        self.storage_info = (
            "storage",
            storage_type,
            storage_key,
            location,
            storage_numel,
        )

    def __reduce_ex__(self, protocol_version):
        assert False, "this should be handled with out of band"


class SavingProxyForTensor:
    def __init__(self, tensor, saver, protocol_version=5):
        self.protocol_version = protocol_version
        self.reduce_ret_fn, reduce_args = tensor.__reduce_ex__(protocol_version)
        if reduce_args[0] == torch._utils._rebuild_tensor_v2:
            # for Tensors with Python attributes
            (a0, a1, (storage, *a2_other), *other_reduce_args) = reduce_args
            assert isinstance(
                storage, torch.storage.TypedStorage
            ), "Please check for updates"
            storage_proxy = SavingProxyForStorage(
                storage, saver, protocol_version=protocol_version
            )
            self.reduce_args = (a0, a1, (storage_proxy, *a2_other), *other_reduce_args)
        else:
            (storage, *other_reduce_args) = reduce_args
            assert isinstance(
                storage, torch.storage.TypedStorage
            ), "Please check for updates"
            storage_proxy = SavingProxyForStorage(
                storage, saver, protocol_version=protocol_version
            )
            self.reduce_args = (storage_proxy, *other_reduce_args)

    def __reduce_ex__(self, protocol_version):
        if protocol_version != self.protocol_version:
            raise RuntimeError(
                f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}"
            )
        return self.reduce_ret_fn, self.reduce_args


class IncrementalPyTorchPickler(pickle.Pickler):
    def __init__(self, saver, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.storage_dtypes = {}
        self.saver = saver
        self.id_map = {}

    # this logic is taken from PyTorch 2.0+ torch/serialization.py
    def persistent_id(self, obj):
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, SavingProxyForStorage):
            return obj.storage_info

        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                storage_numel = obj._size()

            else:
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                storage_numel = storage.nbytes()

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if storage.data_ptr() != 0:
                if storage.data_ptr() in self.storage_dtypes:
                    if storage_dtype != self.storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that view the same data as different types"
                        )
                else:
                    self.storage_dtypes[storage.data_ptr()] = storage_dtype

            storage_key = self.id_map.get(storage._cdata)
            if storage_key is None:
                storage_key = self.saver._write_storage_and_return_key(storage)
                self.id_map[storage._cdata] = storage_key
            location = torch.serialization.location_tag(storage)

            return ("storage", storage_type, storage_key, location, storage_numel)

        return None


class incremental_save:
    def __init__(self, name):
        self.name = name
        self.zipfile = torch._C.PyTorchFileWriter(str(name))
        self.has_saved = False
        self.next_key = 0

    def __enter__(self):
        return self

    def store_early(self, tensor):
        if isinstance(tensor, torch.Tensor):
            return SavingProxyForTensor(tensor, self)
        raise TypeError(f"can only store tensors early, not {type(tensor)}")

    def save(self, obj):
        if self.has_saved:
            raise RuntimeError("have already saved")
        # Write the pickle data for `obj`
        data_buf = BytesIO()
        pickler = IncrementalPyTorchPickler(self, data_buf, protocol=5)
        pickler.dump(obj)
        data_value = data_buf.getvalue()
        self.zipfile.write_record("data.pkl", data_value, len(data_value))
        self.has_saved = True

    def _write_storage_and_return_key(self, storage):
        if self.has_saved:
            raise RuntimeError("have already saved")
        key = self.next_key
        self.next_key += 1
        name = f"data/{key}"
        if storage.device.type != "cpu":
            storage = storage.cpu()
        num_bytes = storage.nbytes()
        self.zipfile.write_record(name, storage.data_ptr(), num_bytes)
        return key

    def __exit__(self, type, value, traceback):
        self.zipfile.write_end_of_file()


T = TypeVar("T")


def chunked_cross_entropy(
    logits: Union[torch.Tensor, List[torch.Tensor]],
    targets: torch.Tensor,
    chunk_size: int = 128,
) -> torch.Tensor:
    # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate
    # the memory usage in fine-tuning settings with low number of parameters.
    # as a workaround hack, the cross entropy computation is chunked to force it to deallocate on the go, reducing
    # the memory spike's magnitude

    # lm_head was chunked (we are fine-tuning)
    if isinstance(logits, list):
        # don't want to chunk cross entropy
        if chunk_size == 0:
            logits = torch.cat(logits, dim=1)
            logits = logits.reshape(-1, logits.size(-1))
            targets = targets.reshape(-1)
            return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

        # chunk cross entropy
        logit_chunks = [
            logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits
        ]
        target_chunks = [
            target_chunk.reshape(-1)
            for target_chunk in targets.split(logits[0].size(1), dim=1)
        ]
        loss_chunks = [
            torch.nn.functional.cross_entropy(
                logit_chunk, target_chunk, ignore_index=-1, reduction="none"
            )
            for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
        ]
        return torch.cat(loss_chunks).mean()

    # no chunking at all
    logits = logits.reshape(-1, logits.size(-1))
    targets = targets.reshape(-1)
    if chunk_size == 0:
        return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

    # lm_head wasn't chunked, chunk cross entropy
    logit_chunks = logits.split(chunk_size)
    target_chunks = targets.split(chunk_size)
    loss_chunks = [
        torch.nn.functional.cross_entropy(
            logit_chunk, target_chunk, ignore_index=-1, reduction="none"
        )
        for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
    ]
    return torch.cat(loss_chunks).mean()


def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict:
    for checkpoint_name, attribute_name in mapping.items():
        full_checkpoint_name = prefix + checkpoint_name
        if full_checkpoint_name in state_dict:
            full_attribute_name = prefix + attribute_name
            state_dict[full_attribute_name] = state_dict.pop(full_checkpoint_name)
    return state_dict


def get_default_supported_precision(training: bool) -> str:
    """Return default precision that is supported by the hardware: either `bf16` or `16`.

    Args:
        training: `-mixed` or `-true` version of the precision to use

    Returns:
        default precision that is suitable for the task and is supported by the hardware
    """
    from lightning.fabric.accelerators import MPSAccelerator

    if MPSAccelerator.is_available() or (
        torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
    ):
        return "16-mixed" if training else "16-true"
    return "bf16-mixed" if training else "bf16-true"


def load_checkpoint(
    fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, strict: bool = True
) -> None:
    if isinstance(fabric.strategy, FSDPStrategy):
        fabric.load_raw(checkpoint_path, model, strict=strict)
    else:
        state_dict = lazy_load(checkpoint_path)
        state_dict = state_dict.get("model", state_dict)
        model.load_state_dict(state_dict, strict=strict)


def flops_per_param(
    max_seq_length: int, n_layer: int, n_embd: int, n_params: int
) -> int:
    flops_per_token = (
        2 * n_params
    )  # each parameter is used for a MAC (2 FLOPS) per network operation
    # this assumes that all samples have a fixed length equal to the block size
    # which is most likely false during finetuning
    flops_per_seq = flops_per_token * max_seq_length
    attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2))
    return flops_per_seq + attn_flops_per_seq


def estimate_flops(model: "GPT", training: bool) -> int:
    """Measures estimated FLOPs for MFU.

    Refs:
        * https://ar5iv.labs.arxiv.org/html/2205.05198#A1
        * https://ar5iv.labs.arxiv.org/html/2204.02311#A2
    """
    # using all parameters for this is a naive over estimation because not all model parameters actually contribute to
    # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage
    # (~10%) compared to the measured FLOPs, making those lower but more realistic.
    # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper.
    n_trainable_params = num_parameters(model, requires_grad=True)
    trainable_flops = flops_per_param(
        model.max_seq_length,
        model.config.n_layer,
        model.config.n_embd,
        n_trainable_params,
    )
    # forward + backward + gradients (assumes no gradient accumulation)
    ops_per_step = 3 if training else 1
    n_frozen_params = num_parameters(model, requires_grad=False)
    frozen_flops = flops_per_param(
        model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params
    )
    # forward + backward
    frozen_ops_per_step = 2 if training else 1
    return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops
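As a quick illustration of chunked_cross_entropy, the chunked and unchunked paths agree up to floating point error; the shapes below are arbitrary.

    import torch

    logits = torch.randn(2, 16, 100)            # (batch, time, vocab)
    targets = torch.randint(0, 100, (2, 16))

    full = torch.nn.functional.cross_entropy(
        logits.reshape(-1, 100), targets.reshape(-1), ignore_index=-1
    )
    chunked = chunked_cross_entropy(logits, targets, chunk_size=8)
    assert torch.allclose(full, chunked, atol=1e-6)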