Upload tokenizer
This commit registers a dedicated [PAD] token with ID 49152 and enables left padding and 256-token truncation. Files changed:
- added_tokens.json (+3, -0)
- special_tokens_map.json (+7, -1)
- tokenizer.json (+18, -2)
- tokenizer_config.json (+10, -2)
added_tokens.json  ADDED
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 49152
+}
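This new file maps the added token to its ID. With a fast tokenizer in `transformers`, an entry like this is what `save_pretrained` writes after a new special token is registered. A minimal sketch, assuming a base checkpoint whose vocabulary already occupies IDs 0..49151; the checkpoint name below is a placeholder, not named in this commit:

```python
from transformers import AutoTokenizer

# Placeholder checkpoint; the commit does not say which model this is for.
tokenizer = AutoTokenizer.from_pretrained("org/base-model")

# The base vocabulary fills IDs 0..49151, so the new pad token
# is appended with the next free ID, 49152.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
assert tokenizer.pad_token_id == 49152

# save_pretrained writes the four files touched by this commit:
# added_tokens.json, special_tokens_map.json, tokenizer.json,
# and tokenizer_config.json.
tokenizer.save_pretrained("./tokenizer")
```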
special_tokens_map.json  CHANGED
@@ -32,7 +32,13 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token":
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
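The pad token is stored as a full token spec rather than a bare string; the flags mirror the options of `AddedToken` in the `tokenizers` library. A sketch of the equivalent in-code definition (illustrative, not taken from the commit):

```python
from tokenizers import AddedToken

# In-code equivalent of the new pad_token spec above.
pad = AddedToken(
    "[PAD]",
    single_word=False,  # may match even when adjacent to word characters
    lstrip=False,       # do not strip whitespace to the left of the token
    rstrip=False,       # do not strip whitespace to the right
    normalized=False,   # match the raw string, bypassing the normalizer
)
```

Registering it via `tokenizer.add_special_tokens({"pad_token": pad})` marks it as special, which is why tokenizer.json records `"special": true` for ID 49152 below.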
tokenizer.json  CHANGED
@@ -2,11 +2,18 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length":
+    "max_length": 256,
     "strategy": "LongestFirst",
     "stride": 0
   },
-  "padding":
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Left",
+    "pad_to_multiple_of": null,
+    "pad_id": 49152,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,
@@ -160,6 +167,15 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 49152,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
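The new `truncation` and `padding` blocks are what the `tokenizers` backend serializes when truncation and padding are enabled on the tokenizer object. A sketch of calls that would reproduce these exact settings (file paths are illustrative):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# "truncation" block: cut on the right at 256 tokens, LongestFirst strategy.
tok.enable_truncation(max_length=256, stride=0, strategy="longest_first")

# "padding" block: left-pad each batch to its longest sequence
# ("BatchLongest" corresponds to leaving length unset), using the
# new pad token with ID 49152.
tok.enable_padding(
    direction="left",
    pad_id=49152,
    pad_type_id=0,
    pad_token="[PAD]",
)

tok.save("tokenizer.json")
```

Left padding is the common choice for decoder-only generation, since new tokens are appended at the right edge of the batch.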
tokenizer_config.json  CHANGED
@@ -136,6 +136,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "49152": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
@@ -160,8 +168,8 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
-  "model_max_length":
-  "pad_token": "
+  "model_max_length": 256,
+  "pad_token": "[PAD]",
   "tokenizer_class": "GPT2Tokenizer",
   "unk_token": "<|endoftext|>",
   "vocab_size": 49152
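With `model_max_length` and `pad_token` persisted in tokenizer_config.json, and the per-ID token spec in the map above (the `added_tokens_decoder` in recent `transformers` versions), the settings round-trip on load. A quick verification sketch; the repository id is a placeholder for wherever this commit was pushed:

```python
from transformers import AutoTokenizer

# Placeholder repo id for the repository this commit belongs to.
tok = AutoTokenizer.from_pretrained("user/repo")

assert tok.pad_token == "[PAD]"
assert tok.pad_token_id == 49152
assert tok.model_max_length == 256

# Batches now left-pad to the longest sequence and truncate at 256 tokens.
batch = tok(["short", "a somewhat longer example"], padding=True, truncation=True)
print(batch["input_ids"])
```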