import json
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal, Optional, Type, Union

import torch
from typing_extensions import Self

import lit_gpt.model
from lit_gpt.utils import find_multiple


@dataclass
class Config:
    name: str = ""
    hf_config: dict = field(default_factory=dict)
    block_size: int = 4096
    vocab_size: int = 50254
    padding_multiple: int = 512
    padded_vocab_size: Optional[int] = None
    n_layer: int = 16
    n_head: int = 32
    n_embd: int = 4096
    rotary_percentage: float = 0.25
    parallel_residual: bool = True
    bias: bool = True
    lm_head_bias: bool = False
    # to use multi-head attention (MHA), set this to `n_head` (default)
    # to use multi-query attention (MQA), set this to 1
    # to use grouped-query attention (GQA), set this to a value in between
    # Example with `n_head=4`
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │         │        │                 │
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
    # ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
    # │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
    # └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
    # ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
    #         MHA                    GQA                   MQA
    #   n_query_groups=4       n_query_groups=2      n_query_groups=1
    #
    # credit https://arxiv.org/pdf/2305.13245.pdf
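    # Concrete examples from the registry below: falcon-7b uses MQA
    # (n_query_groups=1), Llama-2-70b uses GQA (n_head=64, n_query_groups=8),
    # and most other models keep the MHA default (n_query_groups=n_head).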
    n_query_groups: Optional[int] = None
    shared_attention_norm: bool = False
    _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
    norm_eps: float = 1e-5
    _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
    gelu_approximate: str = "none"
    intermediate_size: Optional[int] = None
    rope_condense_ratio: int = 1
    rope_base: int = 10000
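    # Note: `rope_condense_ratio` divides the RoPE position indices, mapping a
    # longer fine-tuned context onto the base positions (the 16k Vicuna and
    # LongChat variants below use ratios of 4 and 8 respectively), while
    # `rope_base` is the rotary frequency base (Code Llama raises it to 1000000).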

    def __post_init__(self):
        if not self.name:
            self.name = self.hf_config.get("name", self.name)

        assert self.n_embd % self.n_head == 0
        self.head_size = self.n_embd // self.n_head

        # pad the vocab size up to the nearest multiple of `padding_multiple`
        # to be efficient on hardware
        if self.padded_vocab_size is None:
            self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
        else:
            # vocab size shouldn't be larger than padded vocab size
            self.vocab_size = min(self.vocab_size, self.padded_vocab_size)

        # compute the number of query groups
        if self.n_query_groups is not None:
            assert self.n_head % self.n_query_groups == 0
        else:
            self.n_query_groups = self.n_head

        # compute the intermediate size for MLP if not set
        if self.intermediate_size is None:
            if self._mlp_class == "LLaMAMLP":
                raise ValueError("The config needs to set the `intermediate_size`")
            self.intermediate_size = 4 * self.n_embd

        self.rope_n_elem = int(self.rotary_percentage * self.head_size)
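    # Worked example with the defaults above: n_embd=4096 and n_head=32 give
    # head_size=128; rotary_percentage=0.25 then gives rope_n_elem=32; and
    # vocab_size=50254 padded to a multiple of 512 gives padded_vocab_size=50688.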

    @classmethod
    def from_name(cls, name: str, **kwargs: Any) -> Self:
        if name not in name_to_config:
            # search through all `config['hf_config']['name']`
            try:
                conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
            except StopIteration:
                raise ValueError(f"{name!r} is not a supported config name")
        else:
            conf_dict = name_to_config[name]

        # copy the entry so that overrides don't mutate the registry
        conf_dict = conf_dict.copy()
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        conf_dict.update(kwargs)
        return cls(**conf_dict)

    @classmethod
    def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
        with open(path, encoding="utf-8") as fp:
            json_kwargs = json.load(fp)
        if "condense_ratio" in json_kwargs:  # legacy name
            json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        if "org" in json_kwargs:  # legacy name
            json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
        if "org" in kwargs:  # legacy name
            kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
        json_kwargs.update(kwargs)
        return cls(**json_kwargs)

    @classmethod
    def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self:
        """Automatically load `lit_config.json` and, if it doesn't exist, a matching config from `lit_gpt/config.py`."""
        if (config_path := path / "lit_config.json").is_file():
            return cls.from_json(config_path, **kwargs)
        if (model_name := path.name) in name_to_config:
            return cls.from_name(model_name, **kwargs)
        raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor matching config exists.")
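    # e.g. `Config.from_checkpoint(Path("checkpoints/pythia-70m"))` reads
    # `checkpoints/pythia-70m/lit_config.json` if present, otherwise falls
    # back to the registered "pythia-70m" entry.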

    @property
    def mlp_class(self) -> Type:
        # `self._mlp_class` cannot be the type itself to keep the config JSON-serializable
        return getattr(lit_gpt.model, self._mlp_class)

    @property
    def norm_class(self) -> Type:
        # `self._norm_class` cannot be the type itself to keep the config JSON-serializable
        if self._norm_class == "RMSNorm":
            from lit_gpt.rmsnorm import RMSNorm

            return RMSNorm
        return getattr(torch.nn, self._norm_class)


########################
# Stability AI StableLM
########################
configs = [
    # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
    dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
    # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
    dict(
        name="stablelm-base-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
    dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
    dict(
        name="stablelm-tuned-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
]

####################
# EleutherAI Pythia
####################
pythia = [
    # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json
    dict(
        name="pythia-14m",
        hf_config=dict(org="EleutherAI", name="pythia-14m"),
        block_size=512,
        n_layer=6,
        n_embd=128,
        n_head=4,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json
    dict(
        name="pythia-31m",
        hf_config=dict(org="EleutherAI", name="pythia-31m"),
        block_size=1024,
        n_layer=6,
        n_embd=256,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
    dict(
        name="pythia-70m",
        hf_config=dict(org="EleutherAI", name="pythia-70m"),
        block_size=2048,
        n_layer=6,
        n_embd=512,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
    dict(
        name="pythia-160m",
        hf_config=dict(org="EleutherAI", name="pythia-160m"),
        block_size=2048,
        n_layer=12,
        n_embd=768,
        n_head=12,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
    dict(
        name="pythia-410m",
        hf_config=dict(org="EleutherAI", name="pythia-410m"),
        block_size=2048,
        n_layer=24,
        n_embd=1024,
        n_head=16,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
    dict(
        name="pythia-1b",
        hf_config=dict(org="EleutherAI", name="pythia-1b"),
        block_size=2048,
        n_embd=2048,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
    dict(
        name="pythia-1.4b",
        hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
        block_size=2048,
        n_layer=24,
        n_embd=2048,
        n_head=16,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
    dict(
        name="pythia-2.8b",
        hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
    dict(
        name="pythia-6.9b",
        hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
    ),
    # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
    dict(
        name="pythia-12b",
        hf_config=dict(org="EleutherAI", name="pythia-12b"),
        block_size=2048,
        n_layer=36,
        n_embd=5120,
        n_head=40,
    ),
]
configs.extend(pythia)
for c in pythia:
    # "pythia-14m" and "pythia-31m" don't have deduped versions
    if c["name"] in ("pythia-14m", "pythia-31m"):
        continue
    copy = deepcopy(c)
    copy["name"] = f"{c['name']}-deduped"
    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
    configs.append(copy)

####################################
# togethercomputer RedPajama INCITE
####################################
redpajama_incite = [
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
    dict(
        name="RedPajama-INCITE-{}-3B-v1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
    dict(
        name="RedPajama-INCITE-7B-{}",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # this redirects to the checkpoint above. kept for those who had the old weights already downloaded
    dict(
        name="RedPajama-INCITE-{}-7B-v0.1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
]
for c in redpajama_incite:
    for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)
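# The `{}` placeholder in the template names expands to each kind, e.g.
# "RedPajama-INCITE-{}-3B-v1" registers "RedPajama-INCITE-Base-3B-v1",
# "RedPajama-INCITE-Chat-3B-v1" and "RedPajama-INCITE-Instruct-3B-v1".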

#################
# TII UAE Falcon
#################
falcon = [
    # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
    dict(
        name="falcon-7b{}",
        hf_config=dict(org="tiiuae", name="falcon-7b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=32,
        n_head=71,
        n_embd=4544,
        rotary_percentage=1.0,
        n_query_groups=1,
        bias=False,
        # not in the HF config file, but in the original model implementation; applies only to this variant
        shared_attention_norm=True,
    ),
    # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
    dict(
        name="falcon-40b{}",
        hf_config=dict(org="tiiuae", name="falcon-40b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=60,
        n_head=128,
        n_embd=8192,
        rotary_percentage=1.0,
        n_query_groups=8,
        bias=False,
    ),
]
for c in falcon:
    for kind in ("", "-instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
falcon180b = dict(
    name="falcon-180B{}",
    hf_config=dict(org="tiiuae", name="falcon-180B{}"),
    block_size=2048,
    vocab_size=65024,
    padded_vocab_size=65024,
    n_layer=80,
    n_head=232,
    n_embd=14848,
    rotary_percentage=1.0,
    n_query_groups=8,
    bias=False,
)
for kind in ("", "-chat"):
    copy = deepcopy(falcon180b)
    copy["name"] = falcon180b["name"].format(kind)
    copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
    configs.append(copy)

#############################
# OpenLM Research Open LLaMA
#############################
open_LLaMA = [
    # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
    dict(
        name="open_llama_3b",
        hf_config=dict(org="openlm-research", name="open_llama_3b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=26,
        n_embd=3200,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=8640,
    ),
    # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
    dict(
        name="open_llama_7b",
        hf_config=dict(org="openlm-research", name="open_llama_7b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
    dict(
        name="open_llama_13b",
        hf_config=dict(org="openlm-research", name="open_llama_13b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(open_LLaMA)

###############
# LMSYS Vicuna
###############
vicuna = [
    # https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
    dict(
        name="vicuna-7b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
    dict(
        name="vicuna-13b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
    dict(
        name="vicuna-33b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=4,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=4,
    ),
]
configs.extend(vicuna)

#################
# LMSYS LongChat
#################
long_chat = [
    # https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
    dict(
        name="longchat-7b-16k",
        hf_config=dict(org="lmsys", name="longchat-7b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    ),
    # https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
    dict(
        name="longchat-13b-16k",
        hf_config=dict(org="lmsys", name="longchat-13b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=8,
    ),
]
configs.extend(long_chat)

######################
# NousResearch Hermes
######################
nous_research = [
    # https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
    dict(
        name="Nous-Hermes-llama-2-7b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
    dict(
        name="Nous-Hermes-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32001,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
    dict(
        name="Nous-Hermes-Llama2-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
        vocab_size=32000,
        padded_vocab_size=32032,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(nous_research)

###############
# Meta LLaMA 2
###############
llama_2 = [
    # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
    dict(
        name="Llama-2-7b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
    dict(
        name="Llama-2-13b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
    dict(
        name="Llama-2-70b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
for c in llama_2:
    for kind in ("", "-chat"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

##########################
# Stability AI FreeWilly2
##########################
freewilly_2 = [
    # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
    dict(
        name="FreeWilly2",
        hf_config=dict(org="stabilityai", name="FreeWilly2"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    )
]
configs.extend(freewilly_2)

##################
# Meta Code Llama
##################
code_llama = [
    # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
    dict(
        name="CodeLlama-7b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-7b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json
    dict(
        name="CodeLlama-7b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
]
configs.extend(code_llama)

########################
# garage-bAInd Platypus
########################
platypus = [
    # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
    dict(
        name="Platypus-30B",
        hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
        block_size=2048,
        padded_vocab_size=32000,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-06,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
    dict(
        name="Platypus2-7B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
    dict(
        name="Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
    dict(
        name="Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
    dict(
        name="Camel-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
    dict(
        name="Camel-Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
    dict(
        name="Stable-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
    dict(
        name="Platypus2-70B-instruct",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
configs.extend(platypus)

##########################
# Stability AI StableCode
##########################
stablecode = [
    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
    dict(
        name="stablecode-completion-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
        block_size=16384,
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
    dict(
        name="stablecode-completion-alpha-3b-4k",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
    dict(
        name="stablecode-instruct-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
]
configs.extend(stablecode)

##################################
# togethercomputer LLaMA-2-7B-32K
##################################
together_llama2_32k = [
    # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
    dict(
        name="LLaMA-2-7B-32K",
        hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    )
]
configs.extend(together_llama2_32k)

################
# Microsoft Phi
################
phi = [
    # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
    dict(
        name="phi-1_5",
        hf_config=dict(org="microsoft", name="phi-1_5"),
        vocab_size=50257,
        padded_vocab_size=51200,
        block_size=2048,
        n_embd=2048,
        n_layer=24,
        rotary_percentage=0.5,  # 32 / (n_embd / n_head) = 32 / 64
        shared_attention_norm=True,
        lm_head_bias=True,
        gelu_approximate="tanh",
    )
]
configs.extend(phi)

#############
# Mistral AI
#############
mistral = [
    # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
    dict(
        name="Mistral-7B-{}v0.1",
        hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
        padded_vocab_size=32000,
        block_size=4096,  # should be 32768 but sliding window attention is not implemented
        n_layer=32,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=14336,
    )
]
for c in mistral:
    for kind in ("", "Instruct-"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

############
# TinyLlama
############
tiny_llama = [
    dict(
        name="tiny-llama-1.1b{}",
        hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=22,
        n_head=32,
        n_embd=2048,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",  # original TinyLlama uses FusedRMSNorm
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
]
for c in tiny_llama:
    for kind, hf_postfix in (("", "-intermediate-step-955k-token-2T"), ("-chat", "-Chat-v0.6")):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix)
        configs.append(copy)

name_to_config = {config["name"]: config for config in configs}
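

# A minimal usage sketch, guarded so it doesn't run at import time. It only
# exercises the lookup logic defined above; all model names used here are
# entries registered in this file.
if __name__ == "__main__":
    # direct lookup by registered name
    config = Config.from_name("pythia-70m")
    print(config.name, config.n_layer, config.padded_vocab_size)

    # falls back to a search over the `hf_config["name"]` entries
    config = Config.from_name("TinyLlama-1.1B-Chat-v0.6")
    print(config.name)

    # keyword overrides are applied on top of the registered values
    config = Config.from_name("pythia-70m", block_size=1024)
    print(config.block_size)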