|
import importlib |
|
from enum import Enum, auto |
|
|

"""Interface:

tokenizer.encode
tokenizer.decode
tokenizer.convert_ids_to_tokens

tokenizer.parent = ""
tokenizer.vocab_size
tokenizer.get_vocab()   # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name   # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
  - bert
    - Characteristics:
    - Examples:
  - gpt2
    - Characteristics:
  - sentencepiece:
    - Characteristics: .sp_model is a SentencePieceProcessor (sp_model.id_to_piece); has tokenizer.json and tokenizer.model; vocabulary entries contain the "▁" character
    - Examples: llama, baichuan
  - tiktoken
  - icetk
  - hf_tokenizer
    - Characteristics: .model is a tokenizer.models.BPE; vocabulary has entries starting with Ġ ("\u0120"); has merge.txt
    - Examples: gpt_neox_20b, moss
  - gpt3.5 gpt4
    - Characteristics: tiktoken
tokenizer.comments = "split all numbers into individual digits, " \
                     "and fallback to bytes to decompose unknown UTF-8 characters"

tokenizer.all_special_tokens  # baichuan
tokenizer.special_tokens_set  # gpt3.5_turbo
tokenizer.special_tokens_map

tokenizer.dependency [sentencepiece, tiktoken, icetk]
"""
|
|
|
# Enum created through the functional API: members ANT, BEE, CAT, DOG,
# numbered 1-4 in that order.
# NOTE(review): nothing in the visible part of this module references it; it
# looks like a leftover example. Confirm there are no external users before
# removing it.
Animal = Enum('Animal', 'ANT BEE CAT DOG')
|
|
|
# Currently holds a single empty-string entry.
# NOTE(review): this looks like a placeholder; confirm the intended contents.
uniq_tokenizers = [
    "",
]
|
|
|
# Names of the known tokenizers.
# NOTE(review): presumably each entry is the name of a submodule of the
# ``vocab`` package, i.e. a valid argument for ``load_tokener`` -- confirm.
# NOTE(review): "gpt_nexo_20b" is spelled "nexo" (not "neox") and
# "chatglm2-6b" uses a hyphen where its neighbours use underscores. Both are
# kept verbatim because they may have to match existing module names.
all_tokenizers = [
    "gpt_35_turbo",
    "gpt4",
    "gpt2",
    "gpt2_chinese",
    "bert_base_cased",
    "bert_base_uncased",
    "bert_base_chinese",
    "kplug",
    "moss",
    "bloom",
    "gpt_nexo_20b",
    "chatglm_6b",
    "chatglm2-6b",
    "llama",
    "chinese_llama_lora_7b",
    "baichuan_7b",
    "qwen",
    "internlm_chat_7b",
    "goat",
]
|
|
|
|
|
class TokenizerType(Enum):
    """Tokenizer type labels.

    Members are numbered by ``auto()`` in declaration order (1-4). The
    interface notes at the top of this module store a member's ``name`` in
    ``tokenizer.type``.

    References:
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
      (the numbers below are SentencePiece's own model-type codes, not the
      values of this enum)
      - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
      - BPE = 2;      // Byte Pair Encoding
      - WORD = 3;     // Delimitered by whitespace.
      - CHAR = 4;     // tokenizes into character sequence
    """

    BPE = auto()
    ByteBPE = auto()
    GPT2BPETokenizer = auto()
    BERTTokenizer = auto()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TokenizerImpl(Enum):
    """Tokenizer implementation labels.

    Members are numbered by ``auto()`` in declaration order (1-3). The
    interface notes at the top of this module store a member's ``name`` in
    ``tokenizer.implementation``.
    """

    SentencePiece = auto()
    GPT2Tokenizer = auto()
    BertTokenizer = auto()
|
|
|
|
|
def load_tokener(model_name: str):
    """Load the tokenizer exposed by the ``vocab.<model_name>`` module.

    Args:
        model_name: Name of a submodule of the ``vocab`` package. It is
            resolved as a relative import (``"." + model_name``) anchored
            at ``vocab``.

    Returns:
        The ``tokenizer`` attribute of the imported module.

    Raises:
        ImportError: If ``vocab.<model_name>`` cannot be imported.
        AttributeError: If the imported module has no ``tokenizer``
            attribute.
    """
    module = importlib.import_module("." + model_name, package="vocab")
    return module.tokenizer
|
|
|
|
|
# Script entry point: running this module directly currently does nothing.
if __name__ == "__main__":
    pass
|
|