"""
This script has functions and utilities for model export.

Basically, we have a bunch of versions of the model, and we
want to export them to .bin files to be read from and inferenced in C.

Among the "input" versions of PyTorch files/models:
- Official Llama 2 weights released by Meta
- Huggingface weights available on the hub
- llama2.c (this repo) trained models

Among the "output" versions of .bin files:
- v0: Legacy files of the original llama2.c repo (will eventually be DEPRECATED)
- v1-vN: Improved .bin files with a proper header, cache alignment, etc.

This script aspires to provide all of these conversions.
"""
|
import os |
|
import gzip |
|
import shutil |
|
import struct |
|
import argparse |
|
import json |
|
from pathlib import Path |
|
|
|
import numpy as np |
|
import torch |
|
from torch import nn |
|
|
|
from model import ModelArgs, Transformer |

def serialize_fp32(file, tensor):
    """ writes one fp32 tensor to file that is open in wb mode """
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    b = struct.pack(f'{len(d)}f', *d)
    file.write(b)

def serialize_int8(file, tensor):
    """ writes one int8 tensor to file that is open in wb mode """
    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
    b = struct.pack(f'{len(d)}b', *d)
    file.write(b)

def quantize_q80(w, group_size):
    """
    takes a tensor and returns the Q8_0 quantized version
    i.e. symmetric quantization into int8, range [-127,127]
    """
    assert w.numel() % group_size == 0
    ori_shape = w.shape
    w = w.float() # convert to float32
    w = w.reshape(-1, group_size)
    # find the max in each group
    wmax = torch.abs(w).max(dim=1).values
    # calculate the scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # scale into range [-127, 127]
    quant = w / scale[:,None]
    # round to nearest integer
    int8val = torch.round(quant).to(torch.int8)
    # dequantize by rescaling, to measure the round-trip error
    fp32val = (int8val.float() * scale[:,None]).view(-1)
    fp32valr = fp32val.reshape(-1, group_size)
    # calculate the max error in each group
    err = torch.abs(fp32valr - w).max(dim=1).values
    # find the max error across all groups
    maxerr = err.max().item()
    return int8val, scale, maxerr
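
def _dequantize_q80(int8val, scale, shape):
    """ Illustrative sketch (a hypothetical helper, not used by the exporter):
    reconstruct the fp32 approximation of a tensor quantized by quantize_q80 above,
    where `shape` is the original tensor shape. This mirrors the fp32val round trip
    used above to measure the quantization error. """
    return (int8val.float() * scale[:, None]).reshape(shape)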

def legacy_export(model, filepath):
    """ Original export of llama2.c bin files, i.e. version v0 """
    out_file = open(filepath, 'wb')

    # first write out the header
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    p = model.params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    # legacy format uses the sign of vocab_size as the shared classifier flag
    if not shared_classifier:
        p.vocab_size = -p.vocab_size
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                    n_kv_heads, p.vocab_size, p.max_seq_len)
    out_file.write(header)

    # next write out the embedding weights
    serialize_fp32(out_file, model.tok_embeddings.weight)

    # now all the layers
    # attention weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wq.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wk.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wv.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wo.weight)
    # ffn weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.ffn_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w1.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w2.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w3.weight)
    # final rmsnorm
    serialize_fp32(out_file, model.norm.weight)
    # freqs_cos, freqs_sin
    serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
    serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])
    # final classifier weights, only if not shared with the embedding
    if not shared_classifier:
        serialize_fp32(out_file, model.output.weight)

    # write to binary file
    out_file.close()
    print(f"wrote {filepath}")

def version1_export(model, filepath):
    """
    Export the model weights in full float32 .bin file to be read from C.
    This is the same as legacy_export, but with a proper header.
    """
    version = 1

    out_file = open(filepath, 'wb')

    # first write out the header. the header will be 256 bytes
    # 1) write magic, which is the uint32 of "ak42" in ASCII
    out_file.write(struct.pack('I', 0x616b3432))
    # 2) write the version, which is an int
    out_file.write(struct.pack('i', version))
    # 3) write the params, which are 7 ints
    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                    n_kv_heads, p.vocab_size, p.max_seq_len)
    out_file.write(header)
    # 4) write some other flags
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    out_file.write(struct.pack('B', int(shared_classifier)))
    # 5) pad the rest of the header with zeros; tell() returns the current position
    pad = 256 - out_file.tell()
    assert pad >= 0
    out_file.write(b'\0' * pad)

    # now write out all the params
    weights = [
        *[layer.attention_norm.weight for layer in model.layers],
        *[layer.ffn_norm.weight for layer in model.layers],
        model.norm.weight,
        model.tok_embeddings.weight,
        *[layer.attention.wq.weight for layer in model.layers],
        *[layer.attention.wk.weight for layer in model.layers],
        *[layer.attention.wv.weight for layer in model.layers],
        *[layer.attention.wo.weight for layer in model.layers],
        *[layer.feed_forward.w1.weight for layer in model.layers],
        *[layer.feed_forward.w2.weight for layer in model.layers],
        *[layer.feed_forward.w3.weight for layer in model.layers],
    ]
    if not shared_classifier:
        weights.append(model.output.weight)
    for w in weights:
        serialize_fp32(out_file, w)

    # write to binary file
    out_file.close()
    print(f"wrote {filepath}")

def version2_export(model, filepath, group_size=64):
    """
    Export the model weights in Q8_0 into .bin file to be read from C.
    That is:
    - quantize all weights to symmetric int8, in range [-127, 127]
    - all other tensors (the rmsnorm params) are kept and exported in fp32
    - quantization is done in groups of group_size to reduce the effects of any outliers
    """
    version = 2

    # let's first do some validation for this export type
    while model.params.dim % group_size != 0:
        group_size //= 2
        print(f"BACKOFF: reducing group size to {group_size} to fit hidden_dim")
    weights = [
        model.tok_embeddings.weight,
        *[layer.attention.wq.weight for layer in model.layers],
        *[layer.attention.wk.weight for layer in model.layers],
        *[layer.attention.wv.weight for layer in model.layers],
        *[layer.attention.wo.weight for layer in model.layers],
        *[layer.feed_forward.w1.weight for layer in model.layers],
        *[layer.feed_forward.w2.weight for layer in model.layers],
        *[layer.feed_forward.w3.weight for layer in model.layers],
    ]
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    if not shared_classifier:
        weights.append(model.output.weight)
    for i, w in enumerate(weights):
        assert w.numel() % group_size == 0, f"weight {i} has numel {w.numel()}, not a multiple of group_size {group_size}"

    out_file = open(filepath, 'wb')

    # first write out the header. the header will be 256 bytes
    # 1) write magic, which is the uint32 of "ak42" in ASCII
    out_file.write(struct.pack('I', 0x616b3432))
    # 2) write the version, which is an int
    out_file.write(struct.pack('i', version))
    # 3) write the params, which are 7 ints
    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                    n_kv_heads, p.vocab_size, p.max_seq_len)
    out_file.write(header)
    # 4) write some other flags
    out_file.write(struct.pack('B', int(shared_classifier)))
    out_file.write(struct.pack('i', group_size)) # group size used for quantization
    # 5) pad the rest of the header with zeros
    pad = 256 - out_file.tell()
    assert pad >= 0
    out_file.write(b'\0' * pad)

    # now that the header is done, let's write out the model
    # first the params that we are keeping in fp32: the norms
    for layer in model.layers: # attention norms
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers: # ffn norms
        serialize_fp32(out_file, layer.ffn_norm.weight)
    serialize_fp32(out_file, model.norm.weight) # final pre-classifier norm

    # now the weights that we are quantizing to Q8_0
    ew = []
    for i, w in enumerate(weights):
        # quantize this weight
        q, s, err = quantize_q80(w, group_size)
        # save the int8 weights to file, followed by their fp32 scale factors
        serialize_int8(out_file, q)
        serialize_fp32(out_file, s)
        # logging
        ew.append((err, w.shape))
        print(f"{i+1}/{len(weights)} quantized {tuple(w.shape)} to Q8_0 with max error {err}")

    # report the worst group error across all weights
    ew.sort(reverse=True)
    print(f"max quantization group error across all weights: {ew[0][0]}")

    # write to binary file
    out_file.close()
    print(f"wrote {filepath}")

def hf_export(llama_model, filepath, group_size=64, dtype=torch.float32):
    """ Generate the pytorch_model.bin state_dict and config.json for HuggingFace """

    try:
        from transformers.models.llama.configuration_llama import LlamaConfig
    except ImportError:
        print("Error: transformers package is required to load huggingface models")
        print("Please run `pip install transformers` to install it")
        return None

    # generate the LlamaModel state_dict
    hf_state_dict = {}

    # sometimes we have repeated key/value heads (grouped-query attention)
    dim = llama_model.params.dim
    num_key_value_heads = llama_model.params.n_kv_heads
    n_rep = llama_model.params.n_heads // num_key_value_heads
    key_value_dim = dim // n_rep

    # huggingface stores the q and k projections in a permuted layout
    # (cf. transformers' convert_llama_weights_to_hf.py), so permute on the way out
    def permute_original(w, n_heads=llama_model.params.n_heads, dim1=dim, dim2=dim):
        return w.view(dim1, dim2).reshape(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    # transfer between llama2.c and HuggingFace tensor names
    hf_state_dict['model.embed_tokens.weight'] = llama_model.tok_embeddings.weight.clone().to(dtype)
    hf_state_dict['model.norm.weight'] = llama_model.norm.weight.clone().to(dtype)

    # add each layer's weights to the HF state dictionary
    for i, layer in enumerate(llama_model.layers):
        layer_id = layer.layer_id
        hf_state_dict[f'model.layers.{i}.input_layernorm.weight'] = llama_model.layers[layer_id].attention_norm.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.self_attn.q_proj.weight'] = permute_original(llama_model.layers[layer_id].attention.wq.weight.clone()).to(dtype)
        hf_state_dict[f'model.layers.{i}.self_attn.k_proj.weight'] = permute_original(llama_model.layers[layer_id].attention.wk.weight.clone(), num_key_value_heads, key_value_dim, dim).to(dtype)
        hf_state_dict[f'model.layers.{i}.self_attn.v_proj.weight'] = llama_model.layers[layer_id].attention.wv.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.self_attn.o_proj.weight'] = llama_model.layers[layer_id].attention.wo.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.post_attention_layernorm.weight'] = llama_model.layers[layer_id].ffn_norm.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.mlp.gate_proj.weight'] = llama_model.layers[layer_id].feed_forward.w1.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.mlp.down_proj.weight'] = llama_model.layers[layer_id].feed_forward.w2.weight.clone().to(dtype)
        hf_state_dict[f'model.layers.{i}.mlp.up_proj.weight'] = llama_model.layers[layer_id].feed_forward.w3.weight.clone().to(dtype)

    # default to tied weights; overridden below if the classifier is separate
    hf_state_dict['lm_head.weight'] = hf_state_dict['model.embed_tokens.weight']

    # check whether the embeddings are tied, else use the separate output weights
    _embeddings_are_tied: bool = torch.equal(llama_model.tok_embeddings.weight, llama_model.output.weight)
    if not _embeddings_are_tied:
        hf_state_dict['lm_head.weight'] = llama_model.output.weight.clone().to(dtype)

    # build the matching LlamaConfig
    # extract the necessary attributes from the llama2.c model
    vocab_size = llama_model.params.vocab_size
    hidden_size = llama_model.params.dim
    intermediate_size = llama_model.layers[0].feed_forward.w1.weight.shape[0]
    num_hidden_layers = llama_model.params.n_layers
    num_attention_heads = llama_model.params.n_heads
    num_key_value_heads = llama_model.params.n_kv_heads
    max_position_embeddings = llama_model.params.max_seq_len
    rms_norm_eps = llama_model.params.norm_eps

    config = LlamaConfig(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        num_key_value_heads=num_key_value_heads,
        max_position_embeddings=max_position_embeddings,
        rms_norm_eps=rms_norm_eps,
        tie_word_embeddings=_embeddings_are_tied,
        # manual
        architectures=["LlamaForCausalLM"],
        hidden_act="silu",
    )

    # save files into the directory filepath
    os.makedirs(filepath, exist_ok=True)

    # save the state dictionary in .bin format, and the config as .json
    torch.save(hf_state_dict, os.path.join(filepath, "pytorch_model.bin"))
    config.save_pretrained(filepath)
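
def _load_back_hf_dir(filepath):
    """ Illustrative sketch (a hypothetical helper, not used by the exporter):
    a directory written by hf_export can be loaded back with the standard
    transformers API, which is also what load_hf_model below relies on. """
    from transformers import AutoModelForCausalLM
    return AutoModelForCausalLM.from_pretrained(filepath)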

def load_checkpoint(checkpoint):

    # load the provided model checkpoint
    checkpoint_dict = torch.load(checkpoint, map_location='cpu')
    gptconf = ModelArgs(**checkpoint_dict['model_args'])
    model = Transformer(gptconf)
    state_dict = checkpoint_dict['model']
    # strip the '_orig_mod.' prefix that torch.compile adds to parameter names
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model

def load_meta_model(model_path):
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
        print(params)

    # the Meta checkpoint may be sharded across several consolidated.*.pth files
    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
    models = [torch.load(p, map_location='cpu') for p in model_paths]

    def concat_weights(models):
        # merge the model-parallel shards back into single tensors
        state_dict = {}
        for name in list(models[0]):
            tensors = [model[name] for model in models]
            if len(tensors) == 1 or len(tensors[0].shape) == 1:
                state_dict[name] = tensors[0]
                continue
            is_axis_1 = (
                name.startswith('tok_embeddings.')
                or name.endswith('.attention.wo.weight')
                or name.endswith('.feed_forward.w2.weight')
            )
            axis = 1 if is_axis_1 else 0
            state_dict[name] = torch.cat(tensors, dim=axis)
            for model in models:
                del model[name]
        return state_dict

    state_dict = concat_weights(models)
    del models

    # set ModelArgs
    config = ModelArgs()
    config.dim = params["dim"]
    config.n_layers = params["n_layers"]
    config.n_heads = params["n_heads"]
    config.n_kv_heads = params.get('n_kv_heads') or params['n_heads']
    config.multiple_of = params["multiple_of"]
    config.norm_eps = params["norm_eps"]

    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
    config.max_seq_len = 2048

    # create a new Transformer object and set the weights
    model = Transformer(config)

    model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
    model.norm.weight = nn.Parameter(state_dict['norm.weight'])

    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(state_dict[f'layers.{i}.attention_norm.weight'])
        layer.attention.wq.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wq.weight'])
        layer.attention.wk.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wk.weight'])
        layer.attention.wv.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wv.weight'])
        layer.attention.wo.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wo.weight'])
        layer.ffn_norm.weight = nn.Parameter(state_dict[f'layers.{i}.ffn_norm.weight'])
        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w1.weight'])
        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w2.weight'])
        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w3.weight'])

    # final classifier
    model.output.weight = nn.Parameter(state_dict['output.weight'])
    model.eval()
    return model

def load_hf_model(model_path):

    try:
        from transformers import AutoModelForCausalLM
    except ImportError:
        print("Error: transformers package is required to load huggingface models")
        print("Please run `pip install transformers` to install it")
        return None

    # load the HF model
    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
    hf_dict = hf_model.state_dict()

    # convert the LlamaConfig to ModelArgs
    config = ModelArgs()
    config.dim = hf_model.config.hidden_size
    config.n_layers = hf_model.config.num_hidden_layers
    config.n_heads = hf_model.config.num_attention_heads
    config.n_kv_heads = hf_model.config.num_attention_heads
    config.vocab_size = hf_model.config.vocab_size
    config.hidden_dim = hf_model.config.intermediate_size
    config.norm_eps = hf_model.config.rms_norm_eps
    config.max_seq_len = hf_model.config.max_position_embeddings

    # create a new Transformer object and set the weights
    model = Transformer(config)

    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])

    # huggingface permutes wq and wk; this function reverses it (see permute_original in hf_export)
    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight']))
        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])

    # final classifier
    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
    model.eval()
    return model
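
def _check_permute_roundtrip(n_heads=2, dim=8):
    """ Illustrative sketch (a hypothetical check, not used by the exporters):
    permute_reverse above undoes the head reordering that permute_original in
    hf_export applies when mapping wq/wk to the HuggingFace layout. """
    w = torch.arange(dim * dim, dtype=torch.float32).view(dim, dim)
    permuted = w.reshape(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)     # permute_original
    restored = permuted.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim) # permute_reverse
    assert torch.equal(restored, w)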

def model_export(model, filepath, version, dtype=torch.float32):
    """
    Versions docs:
    v-1: huggingface export, i.e. intended for use outside of this repo, in HF
    v0: legacy llama2.c float format, DEPRECATED
    v1: float32 export
    v2: int8 quantized Q8_0 export, similar to llama.cpp, in groups
    # TODO: add dtype export support for other versions (?)
    """
    if version == 0:
        legacy_export(model, filepath)
    elif version == 1:
        version1_export(model, filepath)
    elif version == 2:
        version2_export(model, filepath)
    elif version == -1:
        hf_export(model, filepath, dtype=dtype) # pass dtype by keyword; the positional slot after filepath is group_size
    else:
        raise ValueError(f"unknown version {version}")

def torchscript_export(model, filepath, zero_params=False, gzip_output=False):
    """
    (This was submitted via a PR earlier. Leaving it here, but "orphaned" for now)
    Saves the model as a TorchScript.
    The resulting file can be loaded in C++ code and then used for training or
    inference with:
        #include <torch/script.h>
        torch::jit::Module module = torch::jit::load("model.pt")
    Note that the serialized model includes the initial parameters and with the default
    ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
    the model parameters separately you can zero out the parameters before saving it and
    it will gzip down to 780K.
    """

    # if requested, zero the params before saving; useful in conjunction with gzip_output
    if zero_params:
        for p in model.parameters():
            p.detach().zero_()

    torch.jit.save(torch.jit.script(model), filepath)

    if gzip_output:
        with open(filepath, "rb") as f_in:
            with gzip.open(f"{filepath}.gz", "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.unlink(filepath)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", type=str, help="the output filepath")
    parser.add_argument("--version", default=0, type=int, help="the version to export with")
    parser.add_argument("--dtype", type=str, help="dtype of the model (fp16, fp32)", default="fp32")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--checkpoint", type=str, help="model checkpoint, .pt file")
    group.add_argument("--meta-llama", type=str, help="meta llama model path")
    group.add_argument("--hf", type=str, help="huggingface model path")
    args = parser.parse_args()
    dtype = {"fp16": torch.float16, "fp32": torch.float32}[args.dtype]

    if args.checkpoint:
        model = load_checkpoint(args.checkpoint)
    elif args.meta_llama:
        model = load_meta_model(args.meta_llama)
    elif args.hf:
        model = load_hf_model(args.hf)

    if model is None:
        parser.error("Can't load input model!")

    # export (pass the torch dtype parsed above, not the raw --dtype string)
    model_export(model, args.filepath, args.version, dtype)