File size: 4,618 Bytes
12001a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import sys
import time
import warnings
from pathlib import Path
from typing import Optional
import lightning as L
import torch
from generate import generate
from lit_llama import Tokenizer, LLaMA
from lit_llama.lora import lora
from lit_llama.utils import EmptyInitOnDevice, lazy_load, llama_model_lookup
from scripts.prepare_alpaca import generate_prompt
# LoRA hyperparameters. `main` forwards them to the `lora(...)` context manager
# (as `r`, `alpha` and `dropout`) while the model is being instantiated.
# NOTE(review): presumably these have to match the values used by
# `finetune_lora.py` when the adapter checkpoint was trained -- confirm.
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
def main(
    prompt: str = "What food do lamas eat?",
    input: str = "",
    lora_path: Optional[Path] = None,
    pretrained_path: Optional[Path] = None,
    tokenizer_path: Optional[Path] = None,
    quantize: Optional[str] = None,
    dtype: str = "float32",
    max_new_tokens: int = 100,
    top_k: int = 200,
    temperature: float = 0.8,
) -> None:
    """Generates a response based on a given instruction and an optional input.

    This script will only work with checkpoints from the instruction-tuned LoRA model.
    See `finetune_lora.py`.

    The response is printed to stdout; progress, timing and memory statistics go to stderr.

    Args:
        prompt: The prompt/instruction (Alpaca style).
        input: Optional input (Alpaca style).
        lora_path: Path to the checkpoint with trained LoRA weights, which are the output of
            `finetune_lora.py`. Defaults to ``out/lora/alpaca/lit-llama-lora-finetuned.pth``.
        pretrained_path: The path to the checkpoint with pretrained LLaMA weights.
            Defaults to ``./checkpoints/lit-llama/7B/lit-llama.pth``.
        tokenizer_path: The tokenizer path to load.
            Defaults to ``./checkpoints/lit-llama/tokenizer.model``.
        quantize: Whether to quantize the model and using which method:
            ``"llm.int8"``: LLM.int8() mode,
            ``"gptq.int4"``: GPTQ 4-bit mode.
            Currently any value other than ``None`` is rejected.
        dtype: Name of the ``torch`` dtype to use during generation (e.g. ``"float32"``).
        max_new_tokens: The number of generation steps to take.
        top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
            samples.

    Raises:
        FileNotFoundError: If one of the three resolved paths is not an existing file.
        NotImplementedError: If ``quantize`` is not ``None``.
        ValueError: If ``dtype`` does not name a ``torch.dtype``.
    """
    # Fall back to the conventional locations; `Path(...)` also lets callers pass plain strings.
    lora_path = Path(lora_path or "out/lora/alpaca/lit-llama-lora-finetuned.pth")
    pretrained_path = Path(pretrained_path or "./checkpoints/lit-llama/7B/lit-llama.pth")
    tokenizer_path = Path(tokenizer_path or "./checkpoints/lit-llama/tokenizer.model")

    # Explicit checks rather than `assert`: assertions are stripped under `python -O`,
    # which would defer the failure to a far less readable error further down.
    for description, path in (
        ("LoRA checkpoint", lora_path),
        ("Pretrained checkpoint", pretrained_path),
        ("Tokenizer model", tokenizer_path),
    ):
        if not path.is_file():
            raise FileNotFoundError(f"{description} not found: {path}")

    if quantize is not None:
        raise NotImplementedError("Quantization in LoRA is not supported yet")

    fabric = L.Fabric(devices=1)

    # Resolve the dtype name to the actual torch.dtype object (e.g. "float32" -> torch.float32).
    torch_dtype = getattr(torch, dtype, None)
    if not isinstance(torch_dtype, torch.dtype):
        raise ValueError(f"{dtype} is not a valid dtype.")

    print("Loading model ...", file=sys.stderr)
    t0 = time.perf_counter()
    with (lazy_load(pretrained_path) as pretrained_checkpoint,
          lazy_load(lora_path) as adapter_checkpoint):
        name = llama_model_lookup(pretrained_checkpoint)

        with EmptyInitOnDevice(
            device=fabric.device, dtype=torch_dtype, quantization_mode=quantize
        ), lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
            model = LLaMA.from_name(name)

            # Both loads use strict=False, so neither checkpoint has to supply every key.
            # 1. Load the pretrained weights
            model.load_state_dict(pretrained_checkpoint, strict=False)
            # 2. Load the fine-tuned adapter weights
            model.load_state_dict(adapter_checkpoint, strict=False)

    print(f"Time to load model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

    model.eval()
    model = fabric.setup_module(model)

    tokenizer = Tokenizer(tokenizer_path)
    sample = {"instruction": prompt, "input": input}
    prompt = generate_prompt(sample)
    encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)

    t0 = time.perf_counter()
    output = generate(
        model,
        idx=encoded,
        # NOTE(review): the sequence-length limit is tied to `max_new_tokens`. If `generate`
        # uses it to bound the context it attends to, prompts longer than `max_new_tokens`
        # tokens may be cropped -- confirm against `generate`'s implementation.
        max_seq_length=max_new_tokens,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        eos_id=tokenizer.eos_id,
    )
    t = time.perf_counter() - t0

    # `output` still contains the prompt (the "### Response:" marker is searched for in its
    # decoded text below), so the number of newly generated tokens is the size difference.
    # Generation may stop early (an `eos_id` is passed), so `max_new_tokens` would
    # overstate the throughput.
    tokens_generated = output.numel() - encoded.numel()

    decoded = tokenizer.decode(output)
    sections = decoded.split("### Response:")
    # Keep the text that follows the first marker; if the marker is missing, print the
    # whole decoded text instead of failing with an IndexError.
    response = sections[1].strip() if len(sections) > 1 else decoded.strip()
    print(response)

    print(f"\n\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr)
    if fabric.device.type == "cuda":
        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
if __name__ == "__main__":
    from jsonargparse import CLI

    # Suppress the ComplexHalf warning, which is
    # triggered internally at ../aten/src/ATen/EmptyTensor.cpp:31
    warnings.filterwarnings(
        action="ignore",
        message="ComplexHalf support is experimental and many operators don't support it yet",
    )

    torch.set_float32_matmul_precision("high")

    # Hand `main` to jsonargparse so it is driven from the command line.
    CLI(main)
|