import importlib
from enum import Enum, auto

"""Interface:

tokenizer.encode
tokenizer.decode
tokenizer.convert_ids_to_tokens

tokenizer.parent = ""
tokenizer.vocab_size
tokenizer.get_vocab()  # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
    "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer"
    https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py

    - bert
      - features:
      - examples:
    - gpt2
      - features:
    - sentencepiece
      - features: .sp_model is a SentencePieceProcessor (sp_model.id_to_piece); ships tokenizer.json / tokenizer.model; vocab pieces use the ▁ marker
      - examples: llama, baichuan
    - tiktoken
    - icetk
    - hf_tokenizer
      - features: .model is a tokenizers.models.BPE; vocab entries start with Ġ ("\u0120"); a single tokenizer.json (merges + vocab) or separate files
      - examples: gpt_neox_20b, moss
    - tiktoken
      - features: a space stays a plain space (no ▁/Ġ marker)
      - examples: gpt3.5, gpt4

tokenizer.comments = "split all numbers into individual digits, " \
                     "and fallback to bytes to decompose unknown UTF-8 characters"

tokenizer.all_special_tokens   # baichuan
tokenizer.special_tokens_set   # gpt3.5_turbo
tokenizer.special_tokens_map
tokenizer.dependency           # one of [sentencepiece, tiktoken, icetk]
"""

Animal = Enum('Animal', 'ANT BEE CAT DOG')  # enum functional-API example; not used below

uniq_tokenizers = [
    ""
]

all_tokenizers = [
    "gpt_35_turbo",
    "gpt_4",
    "gpt2",
    "gpt2_chinese",
    "bert_base_cased",
    "bert_base_uncased",
    "bert_base_chinese",
    "kplug",
    "moss",
    #
    # ######
    # "chatyuan_large_v2",
    # "prompt_clue",
    #
    # #### bloom family
    "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",  # model and vocab are both based on bloom
    # "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",
    #
    # ##### glm family
    # "glm_chinese",
    "chatglm_6b",
    "chatglm2-6b",
    #
    # #### llama / alpaca family
    "llama",           # Chinese single-char tokens: 700, multi-char Chinese tokens: 0
    "chinese_llama",
    "chinese_llama2",
    # "chinese_alpaca_lora_7b",  # Chinese Alpaca further fine-tunes the Chinese LLaMA model above on instruction data.
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan_7b",
    "qwen",
    "internlm_chat_7b",
    "goat",
]


class TokenizerType(Enum):
    """
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
        - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
        - BPE = 2;      // Byte Pair Encoding
        - WORD = 3;     // Delimitered by whitespace.
        - CHAR = 4;     // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()  # BBPE, Byte-Level BPE
    GPT2BPETokenizer = auto()
    BERTTokenizer = auto()


# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt vocab file
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # examples:
#     SentencePieceTokenizer = auto()
#
#     # depends on 3 files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()


class TokenizerImpl(Enum):
    """Backing library / implementation behind each tokenizer."""
    SentencePiece = auto()

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocab construction:
    # GPT2Tokenizer = auto()
    BertTokenizer = auto()


def load_tokener(model_name):
    """Dynamically import vocab.<model_name> and return its `tokenizer` attribute."""
    tokenizer = importlib.import_module("." + model_name, "vocab").tokenizer
    return tokenizer


if __name__ == "__main__":
    pass
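

# Hedged sketch, not part of the original interface: the docstring above lists
# duck-typing clues for each backend (SentencePiece wrappers expose `.sp_model`
# and use the "▁" marker, HF BPE tokenizers prefix space-adjacent pieces with
# "Ġ" / "\u0120", BERT-style WordPiece uses "##", tiktoken keeps plain spaces).
# `guess_backend` below merely encodes those clues; it assumes a HF-style
# `get_vocab()` where available and is a best-effort heuristic, not a
# definitive classifier.
def guess_backend(tokenizer):
    """Best-effort guess of the backing library, returned as a plain string."""
    if hasattr(tokenizer, "sp_model"):                       # e.g. llama, baichuan
        return "sentencepiece"
    if type(tokenizer).__module__.startswith("tiktoken"):    # e.g. gpt3.5 / gpt4 encodings
        return "tiktoken"
    vocab = tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else {}
    if any(piece.startswith("\u0120") for piece in vocab):   # Ġ marker: gpt_neox_20b, moss
        return "hf_tokenizer"
    if any(piece.startswith("##") for piece in vocab):       # WordPiece: bert_base_*
        return "bert"
    return "unknown"

# Example usage (hypothetical; assumes the vocab.<model_name> sub-module exists):
#   tok = load_tokener("gpt2")
#   print(guess_backend(tok), tok.vocab_size)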