import importlib
from enum import Enum, auto

"""
Interface:
  - tokenizer.parent = ""
    tokenizer.type = TokenizerType.ByteBPE.name
    tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
    tokenizer.comments = "split all numbers into individual digits, " \
                         "and fallback to bytes to decompose unknown UTF-8 characters"
"""

uniq_tokenizers = [
    ""
]

all_tokenizers = [
    "gpt_35_turbo",
    "gpt2",
    "gpt2_chinese",
    "bert_chinese",
    "moss",
    #
    # ######
    # "chatyuan_large_v2",
    # "prompt_clue",
    #
    # #### bloom series
    # "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",            # both the model and the vocab are based on bloom
    #
    # "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",
    #
    # ##### glm series
    # "glm_chinese",
    "chatglm_6b",
    #
    # #### llama / alpaca series
    "llama",                    # Chinese single-character tokens: 700, Chinese multi-character tokens: 0
    "chinese_llama_lora_7b",    #
    # "chinese_alpaca_lora_7b", # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan_7b",
    "qwen"
]


class TokenizerType(Enum):
    """
    - https://huggingface.co/docs/transformers/tokenizer_summary
    - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
    - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
        - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
        - BPE = 2;      // Byte Pair Encoding
        - WORD = 3;     // Delimited by whitespace
        - CHAR = 4;     // tokenizes into character sequence
    """
    BPE = auto()
    ByteBPE = auto()           # BBPE, Byte-Level BPE
    GPT2BPETokenizer = auto()  #
    BERTTokenizer = auto()     #


# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # depends on a single txt vocab file
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # depends on a single json file, Tokenizer.from_file(vocab_file)
#     # example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # example:
#     SentencePieceTokenizer = auto()
#
#     # depends on three files: vocab.json, merges.txt, special_tokens.txt
#     # source:
#     #   - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()


class TokenizerImpl(Enum):
    """ """
    SentencePiece = auto()  #

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # vocabulary construction:
    # GPT2Tokenizer = auto()

    BertTokenizer = auto()  #


def load_tokener(model_name):
    # each tokenizer lives in its own submodule of the `vocab` package and
    # exposes a module-level `tokenizer` object (see the interface note above)
    tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
    return tokenizer


if __name__ == "__main__":
    pass
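
# Usage sketch (assumptions): loop over the registry and print what each
# tokenizer reports about itself. This assumes every name in `all_tokenizers`
# resolves to a `vocab.<name>` submodule exposing a `tokenizer` object, per the
# interface note at the top of this file; `type` and `implementation` are
# optional attributes, so missing values fall back to "?".
if __name__ == "__main__":
    for _name in all_tokenizers:
        try:
            _tok = load_tokener(_name)
        except Exception as _err:  # some entries need model files or deps that may be absent
            print(f"{_name}: failed to load ({_err})")
            continue
        print(_name, getattr(_tok, "type", "?"), getattr(_tok, "implementation", "?"))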