Spaces:

PrarthanaTS
/

tsai-gpt-from-scratch

Sleeping

App Files Files Community

tsai-gpt-from-scratch / tsai_gpt /config.py

PrarthanaTS

Upload 23 files

f9f4f08 about 1 year ago

raw

history blame contribute delete

37.1 kB

	import json
	from copy import deepcopy
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Any, Literal, Optional, Type, Union

	import torch
	from typing_extensions import Self

	import tsai_gpt.model
	from tsai_gpt.utils import find_multiple


	@dataclass
	class Config:
	name: str = ""
	hf_config: dict = field(default_factory=dict)
	block_size: int = 4096
	vocab_size: int = 50254
	padding_multiple: int = 512
	padded_vocab_size: Optional[int] = None
	n_layer: int = 16
	n_head: int = 32
	n_embd: int = 4096
	rotary_percentage: float = 0.25
	parallel_residual: bool = True
	bias: bool = True
	lm_head_bias: bool = False
	# to use multi-head attention (MHA), set this to `n_head` (default)
	# to use multi-query attention (MQA), set this to 1
	# to use grouped-query attention (GQA), set this to a value in between
	# Example with `n_head=4`
	# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
	# │ v ││ v ││ v ││ v │ │ v │ │ v │ │ v │
	# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
	# │ │ │ │ │ │ │
	# ┌───┐┌───┐┌───┐┌───┐ ┌───┐ ┌───┐ ┌───┐
	# │ k ││ k ││ k ││ k │ │ k │ │ k │ │ k │
	# └───┘└───┘└───┘└───┘ └───┘ └───┘ └───┘
	# │ │ │ │ ┌──┴──┐ ┌──┴──┐ ┌────┬──┴─┬────┐
	# ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐ ┌───┐┌───┐┌───┐┌───┐
	# │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │ │ q ││ q ││ q ││ q │
	# └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘ └───┘└───┘└───┘└───┘
	# ◀──────────────────▶ ◀──────────────────▶ ◀──────────────────▶
	# MHA GQA MQA
	# n_query_groups=4 n_query_groups=2 n_query_groups=1
	#
	# credit https://arxiv.org/pdf/2305.13245.pdf
	n_query_groups: Optional[int] = None
	shared_attention_norm: bool = False
	_norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
	norm_eps: float = 1e-5
	_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
	gelu_approximate: str = "none"
	intermediate_size: Optional[int] = None
	rope_condense_ratio: int = 1
	rope_base: int = 10000

	def __post_init__(self):
	if not self.name:
	self.name = self.hf_config.get("name", self.name)

	assert self.n_embd % self.n_head == 0
	self.head_size = self.n_embd // self.n_head

	# vocab size should be a power of 2 to be optimal on hardware. compute the closest value
	if self.padded_vocab_size is None:
	self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
	else:
	# vocab size shouldn't be larger than padded vocab size
	self.vocab_size = min(self.vocab_size, self.padded_vocab_size)

	# compute the number of query groups
	if self.n_query_groups is not None:
	assert self.n_head % self.n_query_groups == 0
	else:
	self.n_query_groups = self.n_head

	# compute the intermediate size for MLP if not set
	if self.intermediate_size is None:
	if self._mlp_class == "LLaMAMLP":
	raise ValueError("The config needs to set the `intermediate_size`")
	self.intermediate_size = 4 * self.n_embd

	self.rope_n_elem = int(self.rotary_percentage * self.head_size)

	@classmethod
	def from_name(cls, name: str, **kwargs: Any) -> Self:
	if name not in name_to_config:
	# search through all `config['hf_config']['name']`
	conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
	else:
	conf_dict = name_to_config[name]

	conf_dict = conf_dict.copy()
	if "condense_ratio" in kwargs: # legacy name
	kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
	conf_dict.update(kwargs)
	return cls(**conf_dict)

	@classmethod
	def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
	with open(path, encoding="utf-8") as fp:
	json_kwargs = json.load(fp)
	if "condense_ratio" in json_kwargs: # legacy name
	json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
	if "condense_ratio" in kwargs: # legacy name
	kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
	if "org" in json_kwargs: # legacy name
	json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
	if "org" in kwargs: # legacy name
	kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
	json_kwargs.update(kwargs)
	return cls(**json_kwargs)

	@property
	def mlp_class(self) -> Type:
	# `self._mlp_class` cannot be the type to keep the config json serializable
	return getattr(tsai_gpt.model, self._mlp_class)

	@property
	def norm_class(self) -> Type:
	# `self._norm_class` cannot be the type to keep the config json serializable
	if self._norm_class == "RMSNorm":
	from tsai_gpt.rmsnorm import RMSNorm

	return RMSNorm
	return getattr(torch.nn, self._norm_class)


	########################
	# Stability AI StableLM
	########################
	configs = [
	# https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
	dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
	# https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
	dict(
	name="stablelm-base-alpha-7b",
	hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
	n_head=48,
	n_embd=6144,
	padding_multiple=256,
	),
	# https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
	dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
	# https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
	dict(
	name="stablelm-tuned-alpha-7b",
	hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
	n_head=48,
	n_embd=6144,
	padding_multiple=256,
	),
	]

	####################
	# EleutherAI Pythia
	####################
	pythia = [
	# https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
	dict(
	name="pythia-70m",
	hf_config=dict(org="EleutherAI", name="pythia-70m"),
	block_size=2048,
	n_layer=6,
	n_embd=512,
	n_head=8,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
	dict(
	name="pythia-160m",
	hf_config=dict(org="EleutherAI", name="pythia-160m"),
	block_size=2048,
	n_layer=12,
	n_embd=768,
	n_head=12,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
	dict(
	name="pythia-410m",
	hf_config=dict(org="EleutherAI", name="pythia-410m"),
	block_size=2048,
	n_layer=24,
	n_embd=1024,
	n_head=16,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
	dict(
	name="pythia-1b",
	hf_config=dict(org="EleutherAI", name="pythia-1b"),
	block_size=2048,
	n_embd=2048,
	n_head=8,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
	dict(
	name="pythia-1.4b",
	hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
	block_size=2048,
	n_layer=24,
	n_embd=2048,
	n_head=16,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
	dict(
	name="pythia-2.8b",
	hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
	block_size=2048,
	n_layer=32,
	n_embd=2560,
	padding_multiple=128,
	),
	# https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
	dict(
	name="pythia-6.9b",
	hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
	block_size=2048,
	n_layer=32,
	padding_multiple=256,
	),
	# https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
	dict(
	name="pythia-12b",
	hf_config=dict(org="EleutherAI", name="pythia-12b"),
	block_size=2048,
	n_layer=36,
	n_embd=5120,
	n_head=40,
	),
	]
	configs.extend(pythia)
	for c in pythia:
	copy = c.copy()
	copy["name"] = f"{c['name']}-deduped"
	copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
	configs.append(copy)


	####################################
	# togethercomputer RedPajama INCITE
	####################################
	redpajama_incite = [
	# https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
	dict(
	name="RedPajama-INCITE-{}-3B-v1",
	hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
	block_size=2048,
	n_layer=32,
	n_embd=2560,
	padding_multiple=256,
	rotary_percentage=1.0,
	parallel_residual=False,
	),
	# https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
	dict(
	name="RedPajama-INCITE-7B-{}",
	hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
	block_size=2048,
	n_layer=32,
	padding_multiple=256,
	rotary_percentage=1.0,
	parallel_residual=False,
	),
	# this redirects to the checkpoint above. kept for those who had the old weights already downloaded
	dict(
	name="RedPajama-INCITE-{}-7B-v0.1",
	hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
	block_size=2048,
	n_layer=32,
	padding_multiple=256,
	rotary_percentage=1.0,
	parallel_residual=False,
	),
	]
	for c in redpajama_incite:
	for kind in ("Base", "Chat", "Instruct"):
	copy = c.copy()
	copy["name"] = c["name"].format(kind)
	copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
	configs.append(copy)


	#################
	# TII UAE Falcon
	#################
	falcon = [
	# https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
	dict(
	name="falcon-7b{}",
	hf_config=dict(org="tiiuae", name="falcon-7b{}"),
	block_size=2048,
	vocab_size=65024,
	padded_vocab_size=65024,
	n_layer=32,
	n_head=71,
	n_embd=4544,
	rotary_percentage=1.0,
	n_query_groups=1,
	bias=False,
	# this is not in the config, but in the original model implementation, only for this config
	shared_attention_norm=True,
	),
	# https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
	dict(
	name="falcon-40b{}",
	hf_config=dict(org="tiiuae", name="falcon-40b{}"),
	block_size=2048,
	vocab_size=65024,
	padded_vocab_size=65024,
	n_layer=60,
	n_head=128,
	n_embd=8192,
	rotary_percentage=1.0,
	n_query_groups=8,
	bias=False,
	),
	]
	for c in falcon:
	for kind in ("", "-instruct"):
	copy = c.copy()
	copy["name"] = c["name"].format(kind)
	copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
	configs.append(copy)

	# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
	falcon180b = dict(
	name="falcon-180B{}",
	hf_config=dict(org="tiiuae", name="falcon-180B{}"),
	block_size=2048,
	vocab_size=65024,
	padded_vocab_size=65024,
	n_layer=80,
	n_head=232,
	n_embd=14848,
	rotary_percentage=1.0,
	n_query_groups=8,
	bias=False,
	)

	for kind in ("", "-chat"):
	copy = falcon180b.copy()
	copy["name"] = falcon180b["name"].format(kind)
	copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
	configs.append(copy)


	#############################
	# OpenLM Research Open LLaMA
	#############################
	open_LLaMA = [
	# https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
	dict(
	name="open_llama_3b",
	hf_config=dict(org="openlm-research", name="open_llama_3b"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=26,
	n_embd=3200,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=8640,
	),
	# https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
	dict(
	name="open_llama_7b",
	hf_config=dict(org="openlm-research", name="open_llama_7b"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
	dict(
	name="open_llama_13b",
	hf_config=dict(org="openlm-research", name="open_llama_13b"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	]
	configs.extend(open_LLaMA)


	###############
	# LMSYS Vicuna
	###############
	vicuna = [
	# https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
	dict(
	name="vicuna-7b-v1.3",
	hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
	dict(
	name="vicuna-13b-v1.3",
	hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
	dict(
	name="vicuna-33b-v1.3",
	hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=60,
	n_head=52,
	n_embd=6656,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=17920,
	),
	# https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
	dict(
	name="vicuna-7b-v1.5",
	hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
	dict(
	name="vicuna-7b-v1.5-16k",
	hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_condense_ratio=4,
	),
	# https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
	dict(
	name="vicuna-13b-v1.5",
	hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
	dict(
	name="vicuna-13b-v1.5-16k",
	hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	rope_condense_ratio=4,
	),
	]
	configs.extend(vicuna)


	#################
	# LMSYS LongChat
	#################
	long_chat = [
	# https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
	dict(
	name="longchat-7b-16k",
	hf_config=dict(org="lmsys", name="longchat-7b-16k"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_condense_ratio=8,
	),
	# https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
	dict(
	name="longchat-13b-16k",
	hf_config=dict(org="lmsys", name="longchat-13b-16k"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	rope_condense_ratio=8,
	),
	]
	configs.extend(long_chat)


	######################
	# NousResearch Hermes
	######################
	nous_research = [
	# https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
	dict(
	name="Nous-Hermes-llama-2-7b",
	hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
	padded_vocab_size=32000,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
	dict(
	name="Nous-Hermes-13b",
	hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
	block_size=2048,
	vocab_size=32000,
	padded_vocab_size=32001,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-6,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
	dict(
	name="Nous-Hermes-Llama2-13b",
	hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
	vocab_size=32000,
	padded_vocab_size=32032,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	]
	configs.extend(nous_research)


	###############
	# Meta LLaMA 2
	###############
	llama_2 = [
	# https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
	dict(
	name="Llama-2-7b{}-hf",
	hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
	dict(
	name="Llama-2-13b{}-hf",
	hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
	dict(
	name="Llama-2-70b{}-hf",
	hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=80,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=28672,
	),
	]
	for c in llama_2:
	for kind in ("", "-chat"):
	copy = c.copy()
	copy["name"] = c["name"].format(kind)
	copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
	configs.append(copy)


	##########################
	# Stability AI FreeWilly2
	##########################
	freewilly_2 = [
	# https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
	dict(
	name="FreeWilly2",
	hf_config=dict(org="stabilityai", name="FreeWilly2"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=80,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=28672,
	)
	]
	configs.extend(freewilly_2)


	##################
	# Meta Code Llama
	##################
	code_llama = [
	# https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
	dict(
	name="CodeLlama-7b-hf",
	hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
	block_size=16384,
	vocab_size=32016,
	padding_multiple=16,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
	dict(
	name="CodeLlama-13b-hf",
	hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
	block_size=16384,
	vocab_size=32016,
	padding_multiple=16,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
	dict(
	name="CodeLlama-34b-hf",
	hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=48,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=22016,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
	dict(
	name="CodeLlama-7b-Python-hf",
	hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
	dict(
	name="CodeLlama-13b-Python-hf",
	hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
	dict(
	name="CodeLlama-34b-Python-hf",
	hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=48,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=22016,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json
	dict(
	name="CodeLlama-7b-Instruct-hf",
	hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
	block_size=16384,
	vocab_size=32016,
	padding_multiple=16,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
	dict(
	name="CodeLlama-13b-Instruct-hf",
	hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
	block_size=2048,
	vocab_size=32016,
	padding_multiple=16,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	rope_base=1000000,
	),
	# https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
	dict(
	name="CodeLlama-34b-Instruct-hf",
	hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
	block_size=16384,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=48,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=22016,
	rope_base=1000000,
	),
	]
	configs.extend(code_llama)


	########################
	# garage-bAInd Platypus
	########################
	platypus = [
	# https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
	dict(
	name="Platypus-30B",
	hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
	block_size=2048,
	padded_vocab_size=32000,
	n_layer=60,
	n_head=52,
	n_embd=6656,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-06,
	_mlp_class="LLaMAMLP",
	intermediate_size=17920,
	),
	# https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
	dict(
	name="Platypus2-7B",
	hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
	padded_vocab_size=32000,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	),
	# https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
	dict(
	name="Platypus2-13B",
	hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
	padded_vocab_size=32000,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
	dict(
	name="Platypus2-70B",
	hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
	padded_vocab_size=32000,
	n_layer=80,
	n_head=64,
	n_embd=8192,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=28672,
	),
	# https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
	dict(
	name="Camel-Platypus2-13B",
	hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
	padded_vocab_size=32000,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
	dict(
	name="Camel-Platypus2-70B",
	hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
	padded_vocab_size=32000,
	n_layer=80,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=28672,
	),
	# https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
	dict(
	name="Stable-Platypus2-13B",
	hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
	padded_vocab_size=32000,
	n_layer=40,
	n_head=40,
	n_embd=5120,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=13824,
	),
	# https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
	dict(
	name="Platypus2-70B-instruct",
	hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
	padded_vocab_size=32000,
	n_layer=80,
	n_head=64,
	n_embd=8192,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=28672,
	),
	]
	configs.extend(platypus)


	##########################
	# Stability AI StableCode
	##########################
	stablecode = [
	# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
	dict(
	name="stablecode-completion-alpha-3b",
	hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
	block_size=16384,
	vocab_size=49152,
	n_layer=32,
	n_embd=2560,
	),
	# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
	dict(
	name="stablecode-completion-alpha-3b-4k",
	hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
	vocab_size=49152,
	n_layer=32,
	n_embd=2560,
	),
	# https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
	dict(
	name="stablecode-instruct-alpha-3b",
	hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
	vocab_size=49152,
	n_layer=32,
	n_embd=2560,
	),
	]
	configs.extend(stablecode)


	##################################
	# togethercomputer LLaMA-2-7B-32K
	##################################
	together_llama2_32k = [
	# https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
	dict(
	name="LLaMA-2-7B-32K",
	hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
	vocab_size=32000,
	padding_multiple=64,
	n_layer=32,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	_mlp_class="LLaMAMLP",
	intermediate_size=11008,
	rope_condense_ratio=8,
	)
	]
	configs.extend(together_llama2_32k)


	################
	# Microsoft Phi
	################
	phi = [
	# https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
	dict(
	name="phi-1_5",
	hf_config=dict(org="microsoft", name="phi-1_5"),
	vocab_size=50257,
	padded_vocab_size=51200,
	block_size=2048,
	n_embd=2048,
	n_layer=24,
	rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64
	shared_attention_norm=True,
	lm_head_bias=True,
	gelu_approximate="tanh",
	)
	]
	configs.extend(phi)


	#############
	# Mistral AI
	#############
	mistral = [
	# https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
	dict(
	name="Mistral-7B-{}v0.1",
	hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
	padded_vocab_size=32000,
	block_size=4096, # should be 32768 but sliding window attention is not implemented
	n_layer=32,
	n_query_groups=8,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm",
	norm_eps=1e-05,
	_mlp_class="LLaMAMLP",
	intermediate_size=14336,
	)
	]
	for c in mistral:
	for kind in ("", "Instruct-"):
	copy = c.copy()
	copy["name"] = c["name"].format(kind)
	copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
	configs.append(copy)


	############
	# TinyLlama
	############
	tiny_llama = [
	dict(
	name="tiny-llama-1.1b",
	hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
	block_size=2048,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=22,
	n_head=32,
	n_embd=2048,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
	norm_eps=1e-5,
	_mlp_class="LLaMAMLP",
	intermediate_size=5632,
	n_query_groups=4,
	),
	dict(
	name="tiny-llama-new",
	hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
	block_size=768,
	vocab_size=32000,
	padding_multiple=64,
	n_layer=18,
	n_head=32,
	n_embd=1024,
	rotary_percentage=1.0,
	parallel_residual=False,
	bias=False,
	_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
	norm_eps=1e-5,
	_mlp_class="LLaMAMLP",
	intermediate_size=5632,
	n_query_groups=4,
	),
	]
	configs.extend(tiny_llama)


	name_to_config = {config["name"]: config for config in configs}