File size: 3,976 Bytes
751936e d10ecd7 428b731 d10ecd7 428b731 d10ecd7 428b731 d10ecd7 428b731 751936e 819cf7f 751936e d10ecd7 751936e d10ecd7 751936e 428b731 d10ecd7 751936e d10ecd7 751936e d10ecd7 751936e d10ecd7 751936e d10ecd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import importlib
from enum import Enum, auto
"""Interface:
tokenizer.encode
tokenizer.decode
tokenizer.convert_ids_to_tokens
tokenizer.parent = ""
tokenizer.vocab_size
tokenizer.get_vocab() # gpt-neox-20b, llama
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
    - bert
        - Traits:
        - Examples:
    - gpt2
        - Traits:
    - sentencepiece:
        - Traits: .sp_model is a SentencePieceProcessor (exposes sp_model.id_to_piece); ships tokenizer.json and tokenizer.model; vocabulary entries contain the "▁" character
        - Examples: llama, baichuan
    - tiktoken
    - icetk
    - hf_tokenizer
        - Traits: .model is a tokenizers.models.BPE; vocabulary has entries starting with Ġ "\u0120"; ships merges.txt
        - Examples: gpt_neox_20b, moss
    - gpt3.5 gpt4
        - Traits: tiktoken
tokenizer.comments = "split all numbers into individual digits, " \
"and fallback to bytes to decompose unknown UTF-8 characters"
tokenizer.all_special_tokens # baichuan
tokenizer.special_tokens_set # gpt3.5_turbo
tokenizer.special_tokens_map
tokenizer.dependency [sentencepiece, tiktoken, icetk]
"""
# Sample enum created through the functional Enum API; members are numbered
# from 1 in the order listed.
Animal = Enum("Animal", ["ANT", "BEE", "CAT", "DOG"])
# Holds a single empty-string entry.
# NOTE(review): looks like a placeholder for a de-duplicated tokenizer list - confirm.
uniq_tokenizers = [""]
# Names of the enabled tokenizers, in display order. Entries left as comments
# are disabled.
# NOTE(review): each name is presumably a sub-module of the `vocab` package
# (load_tokener imports "vocab.<name>") - confirm before renaming any entry.
all_tokenizers = [
    "gpt_35_turbo",
    "gpt_4",
    "gpt2",
    "gpt2_chinese",
    "bert_base_cased",
    "bert_base_uncased",
    "bert_base_chinese",
    "kplug",
    "moss",

    # ---- disabled ----
    # "chatyuan_large_v2",
    # "prompt_clue",

    # ---- bloom family ----
    "bloom",
    # "bloomz_6b4_zh",
    # "belle_7b_2m",  # both the model and the vocabulary are based on bloom

    # NOTE(review): spelled "nexo" rather than "neox"; presumably matches the
    # sub-module name on disk - confirm.
    "gpt_nexo_20b",
    # "gpt_neox_chinese_v1",

    # ---- glm family ----
    # "glm_chinese",
    "chatglm_6b",
    # NOTE(review): uses hyphens unlike the other entries - confirm.
    "chatglm2-6b",

    # ---- llama / alpaca family ----
    "llama",  # 'Chinese single-character': 700, 'Chinese multi-character': 0
    "chinese_llama_lora_7b",
    # "chinese_alpaca_lora_7b",  # Chinese Alpaca: the Chinese LLaMA model above, further fine-tuned on instruction data
    # "belle_llama_ext_7b",
    # "alpaca_7b",
    "baichuan_7b",
    "qwen",
    "internlm_chat_7b",
    "goat",
]
class TokenizerType(Enum):
    """Labels for the tokenization scheme a tokenizer uses.

    The module's interface notes store a member name on the tokenizer, e.g.
    ``tokenizer.type = TokenizerType.ByteBPE.name``.

    References:
        - https://huggingface.co/docs/transformers/tokenizer_summary
        - https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
        - https://github.com/google/sentencepiece/blob/3863f7648e5d8edb571ac592f3ac4f5f0695275a/src/sentencepiece_model.proto#L48
            - UNIGRAM = 1;  // Unigram language model with dynamic algorithm
            - BPE = 2;      // Byte Pair Encoding
            - WORD = 3;     // Delimitered by whitespace.
            - CHAR = 4;     // tokenizes into character sequence
    """

    # Values come from auto(), so they are 1..4 in declaration order; callers
    # should rely on the member names, not on the numbers.
    BPE = auto()
    ByteBPE = auto()  # BBPE: byte-level BPE
    GPT2BPETokenizer = auto()
    BERTTokenizer = auto()
# class TokenizerType(Enum):
#
#     # BERTTokenizer
#     # Depends on a single txt file
#
#
#     # https://github.com/EleutherAI/gpt-neox/blob/v2.0/megatron/tokenizer/tokenizer.py#L231
#     # Depends on a single json file, Tokenizer.from_file(vocab_file)
#     # Example: gpt-neox-20B
#     HFTokenizer = auto()
#
#     # Depends on: model_file, sentencepiece.SentencePieceProcessor(model_file)
#     # Example:
#     SentencePieceTokenizer = auto()
#
#
#     # Depends on 3 files: vocab.json, merges.txt, special_tokens.txt
#     # Source:
#     # - https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/tokenizer/gpt2_tokenization.py#L92
#     # Byte-level BPE
#     GPT2BPETokenizer = auto()
class TokenizerImpl(Enum):
    """Labels for the implementation backing a tokenizer.

    The module's interface notes store a member name on the tokenizer, e.g.
    ``tokenizer.implementation = TokenizerImpl.SentencePiece.name``.
    """

    # Values come from auto(), so they are 1..3 in declaration order; callers
    # should rely on the member names, not on the numbers.
    SentencePiece = auto()

    # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
    # Vocabulary construction: (note left unfinished in the original)
    GPT2Tokenizer = auto()
    BertTokenizer = auto()
def load_tokener(model_name: str):
    """Import ``vocab.<model_name>`` and return its ``tokenizer`` attribute.

    Args:
        model_name: Name of the sub-module to import, relative to the
            ``vocab`` package (presumably one of the names in
            ``all_tokenizers`` - confirm against callers).

    Returns:
        Whatever object the imported module exposes as ``tokenizer``.

    Raises:
        ImportError: If ``vocab.<model_name>`` cannot be imported.
        AttributeError: If the imported module defines no ``tokenizer``.
    """
    # The leading dot makes this a relative import anchored at the `vocab`
    # package, i.e. it resolves to "vocab.<model_name>".
    module = importlib.import_module("." + model_name, "vocab")
    return module.tokenizer
if __name__ == "__main__":
    # Running this file directly intentionally does nothing.
    pass
|