Rachel Bawden committed on
Commit
7f2f93b
·
1 Parent(s): c22a4d0

update creative tokeniser

Browse files
Files changed (1) hide show
  1. tokenizer.json +14 -76
tokenizer.json CHANGED
@@ -68,75 +68,7 @@
68
  "replacement": "▁",
69
  "add_prefix_space": true
70
  },
71
- "post_processor": {
72
- "type": "TemplateProcessing",
73
- "single": [
74
- {
75
- "SpecialToken": {
76
- "id": "<s>",
77
- "type_id": 0
78
- }
79
- },
80
- {
81
- "Sequence": {
82
- "id": "A",
83
- "type_id": 0
84
- }
85
- },
86
- {
87
- "SpecialToken": {
88
- "id": "</s>",
89
- "type_id": 0
90
- }
91
- }
92
- ],
93
- "pair": [
94
- {
95
- "SpecialToken": {
96
- "id": "<s>",
97
- "type_id": 0
98
- }
99
- },
100
- {
101
- "Sequence": {
102
- "id": "A",
103
- "type_id": 0
104
- }
105
- },
106
- {
107
- "Sequence": {
108
- "id": "B",
109
- "type_id": 1
110
- }
111
- },
112
- {
113
- "SpecialToken": {
114
- "id": "</s>",
115
- "type_id": 1
116
- }
117
- }
118
- ],
119
- "special_tokens": {
120
- "</s>": {
121
- "id": "</s>",
122
- "ids": [
123
- 2
124
- ],
125
- "tokens": [
126
- "</s>"
127
- ]
128
- },
129
- "<s>": {
130
- "id": "<s>",
131
- "ids": [
132
- 1
133
- ],
134
- "tokens": [
135
- "<s>"
136
- ]
137
- }
138
- }
139
- },
140
  "decoder": {
141
  "type": "Metaspace",
142
  "replacement": "▁",
@@ -8112,12 +8044,18 @@
8112
  "▁précé": 7959,
8113
  "▁satis": 7960,
8114
  "▁unilat": 7961,
8115
- "<": 7962,
8116
- "<t": 7963,
8117
- "▁<t": 7964,
8118
- "▁<t>": 7967,
8119
- "</": 7965,
8120
- "</s": 7966
 
 
 
 
 
 
8121
  },
8122
  "merges": [
8123
  "▁ d",
@@ -21716,4 +21654,4 @@
21716
  "</s >"
21717
  ]
21718
  }
21719
- }
 
68
  "replacement": "▁",
69
  "add_prefix_space": true
70
  },
71
+ "post_processor": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  "decoder": {
73
  "type": "Metaspace",
74
  "replacement": "▁",
 
8044
  "▁précé": 7959,
8045
  "▁satis": 7960,
8046
  "▁unilat": 7961,
8047
+ "madeupword0000": 7962,
8048
+ "madeupword0001": 7963,
8049
+ "madeupword0002": 7964,
8050
+ "madeupword0003": 7965,
8051
+ "madeupword0004": 7966,
8052
+ "madeupword0005": 7967,
8053
+ "<": 7968,
8054
+ "<t": 7969,
8055
+ "▁<t": 7970,
8056
+ "▁<t>": 7971,
8057
+ "</": 7972,
8058
+ "</s": 7973
8059
  },
8060
  "merges": [
8061
  "▁ d",
 
21654
  "</s >"
21655
  ]
21656
  }
21657
+ }