SharpAI committed on
Commit
efdc58c
1 Parent(s): 27f4872

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +313 -0
  3. tokenizer_config.json +6 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "[end]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 160,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 160
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 234,
16
+ "pad_type_id": 0,
17
+ "pad_token": "[PAD]"
18
+ },
19
+ "added_tokens": [
20
+ {
21
+ "id": 1,
22
+ "content": "[UNK]",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 3,
31
+ "content": "[end]",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 234,
40
+ "content": "[PAD]",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 235,
49
+ "content": "[MASK]",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ },
56
+ {
57
+ "id": 236,
58
+ "content": "[SEP]",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ }
65
+ ],
66
+ "normalizer": null,
67
+ "pre_tokenizer": null,
68
+ "post_processor": null,
69
+ "decoder": null,
70
+ "model": {
71
+ "type": "WordPiece",
72
+ "unk_token": "[UNK]",
73
+ "continuing_subword_prefix": "##",
74
+ "max_input_chars_per_word": 100,
75
+ "vocab": {
76
+ "": 0,
77
+ "[UNK]": 1,
78
+ "[start]": 2,
79
+ "[end]": 3,
80
+ "malicious": 4,
81
+ "benign": 5,
82
+ "alert": 6,
83
+ "server_name": 7,
84
+ "sequence": 8,
85
+ "TLS": 9,
86
+ "cipher": 10,
87
+ "change_cipher_spec": 11,
88
+ "client_key_length": 12,
89
+ "c0": 13,
90
+ "s0": 14,
91
+ "c1": 15,
92
+ "s1": 16,
93
+ "c2": 17,
94
+ "s2": 18,
95
+ "c3": 19,
96
+ "s3": 20,
97
+ "c4": 21,
98
+ "s4": 22,
99
+ "c5": 23,
100
+ "s5": 24,
101
+ "c6": 25,
102
+ "s6": 26,
103
+ "c7": 27,
104
+ "s7": 28,
105
+ "c8": 29,
106
+ "s8": 30,
107
+ "c9": 31,
108
+ "s9": 32,
109
+ "c10": 33,
110
+ "s10": 34,
111
+ "c11": 35,
112
+ "s11": 36,
113
+ "c12": 37,
114
+ "s12": 38,
115
+ "c13": 39,
116
+ "s13": 40,
117
+ "c14": 41,
118
+ "s14": 42,
119
+ "c15": 43,
120
+ "s15": 44,
121
+ "c16": 45,
122
+ "s16": 46,
123
+ "l<1": 47,
124
+ "l:1": 48,
125
+ "l:2": 49,
126
+ "l:3": 50,
127
+ "l:4": 51,
128
+ "l:5": 52,
129
+ "l:6": 53,
130
+ "l:7": 54,
131
+ "l:8": 55,
132
+ "l:9": 56,
133
+ "l:10": 57,
134
+ "l:11": 58,
135
+ "l:12": 59,
136
+ "l:13": 60,
137
+ "l:14": 61,
138
+ "l:15": 62,
139
+ "l:16": 63,
140
+ "l:17": 64,
141
+ "l:18": 65,
142
+ "l:19": 66,
143
+ "l:20": 67,
144
+ "l>20": 68,
145
+ "l>10": 69,
146
+ "a": 70,
147
+ "b": 71,
148
+ "c": 72,
149
+ "d": 73,
150
+ "e": 74,
151
+ "f": 75,
152
+ "g": 76,
153
+ "h": 77,
154
+ "i": 78,
155
+ "j": 79,
156
+ "k": 80,
157
+ "l": 81,
158
+ "m": 82,
159
+ "n": 83,
160
+ "o": 84,
161
+ "p": 85,
162
+ "q": 86,
163
+ "r": 87,
164
+ "s": 88,
165
+ "t": 89,
166
+ "u": 90,
167
+ "v": 91,
168
+ "w": 92,
169
+ "x": 93,
170
+ "y": 94,
171
+ "z": 95,
172
+ "0": 96,
173
+ "1": 97,
174
+ "2": 98,
175
+ "3": 99,
176
+ "4": 100,
177
+ "5": 101,
178
+ "6": 102,
179
+ "7": 103,
180
+ "8": 104,
181
+ "9": 105,
182
+ ".": 106,
183
+ "-": 107,
184
+ "SSLv2": 108,
185
+ "SSLv3": 109,
186
+ "TLS1.0": 110,
187
+ "TLS1.1": 111,
188
+ "TLS1.2": 112,
189
+ "TLS1.3": 113,
190
+ "TLS1.3-d18": 114,
191
+ "TLS1.3-d19": 115,
192
+ "TLS_RSA_WITH_RC4_128_MD5": 116,
193
+ "TLS_RSA_WITH_RC4_128_SHA": 117,
194
+ "TLS_RSA_WITH_3DES_EDE_CBC_SHA": 118,
195
+ "TLS_DHE_DSS_WITH_3DES_EDE_CBC_SHA": 119,
196
+ "TLS_DHE_RSA_WITH_DES_CBC_SHA": 120,
197
+ "TLS_RSA_WITH_AES_128_CBC_SHA": 121,
198
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA": 122,
199
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA": 123,
200
+ "TLS_RSA_WITH_AES_256_CBC_SHA": 124,
201
+ "TLS_DHE_DSS_WITH_AES_256_CBC_SHA": 125,
202
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA": 126,
203
+ "TLS_RSA_WITH_AES_128_CBC_SHA256": 127,
204
+ "TLS_RSA_WITH_AES_256_CBC_SHA256": 128,
205
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256": 129,
206
+ "TLS_RSA_WITH_CAMELLIA_128_CBC_SHA": 130,
207
+ "TLS_DHE_DSS_WITH_CAMELLIA_128_CBC_SHA": 131,
208
+ "TLS_DHE_RSA_WITH_CAMELLIA_128_CBC_SHA": 132,
209
+ "SSL_RSA_EXPORT1024_WITH_RC4_56_SHA": 133,
210
+ "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256": 134,
211
+ "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256": 135,
212
+ "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256": 136,
213
+ "TLS_RSA_WITH_CAMELLIA_256_CBC_SHA": 137,
214
+ "TLS_DHE_DSS_WITH_CAMELLIA_256_CBC_SHA": 138,
215
+ "TLS_DHE_RSA_WITH_CAMELLIA_256_CBC_SHA": 139,
216
+ "TLS_RSA_WITH_SEED_CBC_SHA": 140,
217
+ "TLS_RSA_WITH_AES_128_GCM_SHA256": 141,
218
+ "TLS_RSA_WITH_AES_256_GCM_SHA384": 142,
219
+ "TLS_DHE_RSA_WITH_AES_128_GCM_SHA256": 143,
220
+ "TLS_DHE_RSA_WITH_AES_256_GCM_SHA384": 144,
221
+ "TLS_DHE_DSS_WITH_AES_128_GCM_SHA256": 145,
222
+ "TLS_DHE_DSS_WITH_AES_256_GCM_SHA384": 146,
223
+ "TLS_DHE_PSK_WITH_AES_128_GCM_SHA256": 147,
224
+ "TLS_DHE_PSK_WITH_AES_256_GCM_SHA384": 148,
225
+ "TLS_AES_128_GCM_SHA256": 149,
226
+ "TLS_AES_256_GCM_SHA384": 150,
227
+ "TLS_CHACHA20_POLY1305_SHA256": 151,
228
+ "TLS_AES_128_CCM_SHA256": 152,
229
+ "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA": 153,
230
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA": 154,
231
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA": 155,
232
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA": 156,
233
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA": 157,
234
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256": 158,
235
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384": 159,
236
+ "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256": 160,
237
+ "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384": 161,
238
+ "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256": 162,
239
+ "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384": 163,
240
+ "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256": 164,
241
+ "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384": 165,
242
+ "TLS_DHE_RSA_WITH_AES_128_CCM": 166,
243
+ "TLS_DHE_RSA_WITH_AES_256_CCM": 167,
244
+ "TLS_DHE_RSA_WITH_AES_128_CCM_8": 168,
245
+ "TLS_DHE_RSA_WITH_AES_256_CCM_8": 169,
246
+ "TLS_DHE_PSK_WITH_AES_128_CCM": 170,
247
+ "TLS_DHE_PSK_WITH_AES_256_CCM": 171,
248
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CCM": 172,
249
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CCM": 173,
250
+ "TLS_ECDHE_ECDSA_WITH_AES_128_CCM_8": 174,
251
+ "TLS_ECDHE_ECDSA_WITH_AES_256_CCM_8": 175,
252
+ "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256": 176,
253
+ "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256": 177,
254
+ "TLS_DHE_RSA_WITH_CHACHA20_POLY1305_SHA256": 178,
255
+ "TLS_ECDHE_PSK_WITH_CHACHA20_POLY1305_SHA256": 179,
256
+ "TLS_DHE_PSK_WITH_CHACHA20_POLY1305_SHA256": 180,
257
+ "TLS_ECDHE_PSK_WITH_AES_128_GCM_SHA256": 181,
258
+ "TLS_ECDHE_PSK_WITH_AES_256_GCM_SHA384": 182,
259
+ "TLS_ECDHE_PSK_WITH_AES_128_CCM_SHA256": 183,
260
+ "FIN": 184,
261
+ "SYN": 185,
262
+ "RST": 186,
263
+ "PSH": 187,
264
+ "ACK": 188,
265
+ "URG": 189,
266
+ "ECE": 190,
267
+ "CWR": 191,
268
+ "ramnit": 192,
269
+ "crthrazy": 193,
270
+ "nymaim": 194,
271
+ "bunitu": 195,
272
+ "zeus": 196,
273
+ "azorult": 197,
274
+ "parite": 198,
275
+ "vawtrak": 199,
276
+ "reposfxg": 200,
277
+ "zeus-panda": 201,
278
+ "gandcrab": 202,
279
+ "bankerx": 203,
280
+ "gootkit": 204,
281
+ "dridex": 205,
282
+ "upatre": 206,
283
+ "qakbot": 207,
284
+ "chthonic": 208,
285
+ "emotet": 209,
286
+ "troldesh": 210,
287
+ "kovter": 211,
288
+ "boleto": 212,
289
+ "hancitor": 213,
290
+ "remcos": 214,
291
+ "trickbot": 215,
292
+ "rig": 216,
293
+ "tofsee": 217,
294
+ "neutrino": 218,
295
+ "icedid": 219,
296
+ "dreambot": 220,
297
+ "miuref": 221,
298
+ "crypt": 222,
299
+ "cerber": 223,
300
+ "unclassified": 224,
301
+ "sigma": 225,
302
+ "spora": 226,
303
+ "locky": 227,
304
+ "fallout": 228,
305
+ "banload": 229,
306
+ "globeimposter": 230,
307
+ "angler": 231,
308
+ "ursnif": 232,
309
+ "?": 233,
310
+ "[PAD]": 234
311
+ }
312
+ }
313
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "max_len": 256,
3
+ "name_or_path": "SharpAI/mal-tls-bert-base",
4
+ "special_tokens_map_file": "/root/.cache/huggingface/transformers/85a76eea59fe40ae80bc50b05c4fe93e7547727086c4a19787726e35a451f9fd.45ed21ffc69cb3eceab51050529cfc4e1b82b5f17027779bf75c6eacc17a5079",
5
+ "tokenizer_class": "PreTrainedTokenizerFast"
6
+ }