kcz358 committed
Commit e6f7e39 · verified · 1 Parent(s): 459b28a

Upload tokenizer
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+{
+  "<image>": 151646,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}
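
The new map pins fixed ids to the chat and image special tokens. A minimal sketch (Python with transformers; the repository id is a placeholder, not part of this commit) to confirm the ids after the upload:

# Sketch only: verifies the ids listed in added_tokens.json.
# "your-org/your-model" is a placeholder repository id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

for token in ["<image>", "<|endoftext|>", "<|im_end|>", "<|im_start|>"]:
    # Expected: 151646, 151643, 151645, 151644 respectively
    print(token, tokenizer.convert_tokens_to_ids(token))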
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -3,13 +3,6 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "bos_token": {
-    "content": "<|im_start|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "eos_token": {
     "content": "<|im_end|>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -63,58 +63,10 @@
     ]
   },
   "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      }
-    ],
-    "pair": [
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|im_start|>",
-          "type_id": 1
-        }
-      },
-      {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
-      }
-    ],
-    "special_tokens": {
-      "<|im_start|>": {
-        "id": "<|im_start|>",
-        "ids": [
-          151644
-        ],
-        "tokens": [
-          "<|im_start|>"
-        ]
-      }
-    }
+    "type": "ByteLevel",
+    "add_prefix_space": false,
+    "trim_offsets": false,
+    "use_regex": false
   },
   "decoder": {
     "type": "ByteLevel",
@@ -130,6 +82,7 @@
     "end_of_word_suffix": "",
     "fuse_unk": false,
     "byte_fallback": false,
+    "ignore_merges": false,
     "vocab": {
       "!": 0,
       "\"": 1,
tokenizer_config.json CHANGED
@@ -1,6 +1,5 @@
 {
-  "add_bos_token": true,
-  "add_eos_token": false,
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "151643": {
       "content": "<|endoftext|>",
@@ -39,7 +38,7 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "bos_token": "<|im_start|>",
+  "bos_token": null,
   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
@@ -47,9 +46,7 @@
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
-  "processor_class": "LlavaProcessor",
   "split_special_tokens": false,
-  "tokenizer_class": "LlamaTokenizer",
-  "unk_token": null,
-  "use_default_system_prompt": false
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
 }
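
The chat_template kept in this config renders ChatML-style prompts and injects a default system message when none is given. A small usage sketch (placeholder repository id; the message content is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")  # placeholder id

messages = [{"role": "user", "content": "Describe this image."}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Expected shape of the output, per the template above:
# <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
# <|im_start|>user\nDescribe this image.<|im_end|>\n
# <|im_start|>assistant\n
print(prompt)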
vocab.json ADDED
The diff for this file is too large to render. See raw diff