kcz358 committed
Commit e6f7e39 · verified · 1 Parent(s): 459b28a

Upload tokenizer
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+{
+  "<image>": 151646,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}
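
The new map pins fixed ids to the chat and image special tokens. A minimal sketch (Python with transformers; the repository id is a placeholder, not part of this commit) to confirm the ids after the upload:

# Sketch only: verifies the ids listed in added_tokens.json.
# "your-org/your-model" is a placeholder repository id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

for token in ["<image>", "<|endoftext|>", "<|im_end|>", "<|im_start|>"]:
    # Expected: 151646, 151643, 151645, 151644 respectively
    print(token, tokenizer.convert_tokens_to_ids(token))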
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -3,13 +3,6 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "bos_token": {
-    "content": "<|im_start|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "eos_token": {
     "content": "<|im_end|>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -63,58 +63,10 @@
     ]
   },
   "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      }
-    ],
-    "pair": [
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 1
-        }
-      },
-      {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
-      }
-    ],
-    "special_tokens": {
-      "<|im_start|>": {
-        "id": "<|im_start|>",
-        "ids": [
-          151644
-        ],
-        "tokens": [
-          "<|im_start|>"
-        ]
-      }
-    }
+    "type": "ByteLevel",
+    "add_prefix_space": false,
+    "trim_offsets": false,
+    "use_regex": false
   },
   "decoder": {
     "type": "ByteLevel",
@@ -130,6 +82,7 @@
     "end_of_word_suffix": "",
     "fuse_unk": false,
     "byte_fallback": false,
+    "ignore_merges": false,
     "vocab": {
       "!": 0,
       "\"": 1,
tokenizer_config.json CHANGED
@@ -1,6 +1,5 @@
 {
-  "add_bos_token": true,
-  "add_eos_token": false,
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "151643": {
       "content": "<|endoftext|>",
@@ -39,7 +38,7 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "bos_token": "<|im_start|>",
+  "bos_token": null,
   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
@@ -47,9 +46,7 @@
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
-  "processor_class": "LlavaProcessor",
   "split_special_tokens": false,
-  "tokenizer_class": "LlamaTokenizer",
-  "unk_token": null,
-  "use_default_system_prompt": false
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
 }
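
The chat_template kept in this config renders ChatML-style prompts and injects a default system message when none is given. A small usage sketch (placeholder repository id; the message content is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")  # placeholder id

messages = [{"role": "user", "content": "Describe this image."}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Expected shape of the output, per the template above:
# <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
# <|im_start|>user\nDescribe this image.<|im_end|>\n
# <|im_start|>assistant\n
print(prompt)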
vocab.json ADDED
The diff for this file is too large to render. See raw diff