# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# https://github.com/rasbt/LLMs-from-scratch/blob/main/ch05/07_gpt_to_llama/standalone-llama32.ipynb

import os
from pathlib import Path

import tiktoken
from tiktoken.load import load_tiktoken_bpe


class Llama3Tokenizer:
    """Thin wrapper around tiktoken that keeps track of Llama-3 special IDs."""

    def __init__(self, model_path):
        if not os.path.isfile(model_path):
            raise FileNotFoundError(model_path)

        mergeable = load_tiktoken_bpe(model_path)

        # hard-coded from Meta's tokenizer.json
        self.special = {
            "<|begin_of_text|>": 128000,
            "<|end_of_text|>": 128001,
            "<|start_header_id|>": 128006,
            "<|end_header_id|>": 128007,
            "<|eot_id|>": 128009,
        }
        # fill the remaining IDs up to 128257 with reserved placeholder tokens
        self.special.update({f"<|reserved_{i}|>": 128002 + i
                             for i in range(256)
                             if 128002 + i not in self.special.values()})

        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            # Llama-3 pre-tokenization split pattern (tiktoken-style regex)
            pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
                    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
                    r"|\p{N}{1,3}"
                    r"| ?[^\s\p{L}\p{N}]+[\r\n]*"
                    r"|\s*[\r\n]+"
                    r"|\s+(?!\S)"
                    r"|\s+",
            mergeable_ranks=mergeable,
            special_tokens=self.special,
        )
    def encode(self, text, bos=False, eos=False, allowed_special=None):
        ids: list[int] = []

        # avoid a mutable default argument and tolerate allowed_special=None
        if allowed_special is None:
            allowed_special = set()

        if bos:
            ids.append(self.special["<|begin_of_text|>"])

        # delegate to underlying tiktoken.Encoding.encode
        ids.extend(
            self.model.encode(
                text,
                allowed_special=allowed_special,
            )
        )

        if eos:
            ids.append(self.special["<|end_of_text|>"])

        return ids

    def decode(self, ids):
        return self.model.decode(ids)
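
# Example usage of Llama3Tokenizer (a sketch; "tokenizer.model" stands in for
# Meta's Llama 3 BPE file, which must be downloaded separately):
#
#   tok = Llama3Tokenizer("tokenizer.model")
#   ids = tok.encode("Hello, world!", bos=True, eos=True)
#   ids[0], ids[-1]   # (128000, 128001), i.e. <|begin_of_text|>, <|end_of_text|>
#   tok.decode(ids)   # '<|begin_of_text|>Hello, world!<|end_of_text|>'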


class ChatFormat:
    def __init__(self, tokenizer: Llama3Tokenizer, *,
                 default_system="You are a helpful assistant."):
        self.tok = tokenizer
        self.default_system = default_system

    def _header(self, role):
        r"""Encode <|start_header_id|>role<|end_header_id|>\n\n"""
        return (
            [self.tok.special["<|start_header_id|>"]]
            + self.tok.encode(role)
            + [self.tok.special["<|end_header_id|>"]]
            + self.tok.encode("\n\n")
        )

    def encode(self, user_message, system_message=None, allowed_special=None):
        sys_msg = system_message if system_message is not None else self.default_system

        ids = [self.tok.special["<|begin_of_text|>"]]

        # system
        ids += self._header("system")
        ids += self.tok.encode(sys_msg, allowed_special=allowed_special)
        ids += [self.tok.special["<|eot_id|>"]]

        # user
        ids += self._header("user")
        ids += self.tok.encode(user_message)
        ids += [self.tok.special["<|eot_id|>"]]

        # assistant header (no content yet)
        ids += self._header("assistant")
        return ids

    def decode(self, ids):
        return self.tok.decode(ids)
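
# Example of the prompt layout ChatFormat.encode produces (a sketch; the
# decoded string is shown rather than raw token IDs):
#
#   chat = ChatFormat(tok)
#   chat.decode(chat.encode("What is 1 + 2?"))
#   # '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
#   # 'You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'
#   # 'What is 1 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'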


def clean_text(text, header_end="assistant<|end_header_id|>\n\n"):
    # find the first occurrence of the assistant header
    index = text.find(header_end)

    if index != -1:
        # return everything after the header, with surrounding whitespace stripped
        return text[index + len(header_end):].strip()
    else:
        # if the header is not found, return the original text unchanged
        return text
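

if __name__ == "__main__":
    # Minimal smoke test (a sketch): assumes a Llama 3 "tokenizer.model" BPE
    # file in the working directory; the path is an assumption, not part of
    # the module above.
    demo_path = "tokenizer.model"
    if os.path.isfile(demo_path):
        tok = Llama3Tokenizer(demo_path)
        chat = ChatFormat(tok)
        prompt = tok.decode(chat.encode("What is 2 + 2?"))
        # append a fake completion; clean_text cuts everything up to and
        # including the assistant header
        print(clean_text(prompt + "2 + 2 = 4."))  # -> "2 + 2 = 4."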