ahadda5 committed
Commit af0c6c3
1 Parent(s): 4376b0c

add tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +59 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +80 -0
  5. vocab.json +0 -0
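
Together, the five files above form a complete Hugging Face tokenizer: vocab.json and merges.txt hold the byte-level BPE model, tokenizer.json is its fast (Rust) serialization, and the two config files register the special tokens. A minimal sketch of loading it with transformers; the local path is a placeholder, since the commit itself does not name the repo id:

from transformers import AutoTokenizer

# "path/to/this-repo" is a placeholder for a local clone of this repo
# (or its hub id), neither of which is named in the commit itself.
tok = AutoTokenizer.from_pretrained("path/to/this-repo")

print(type(tok).__name__)             # BartTokenizer or BartTokenizerFast, per tokenizer_class
print(tok.additional_special_tokens)  # the 12 extra markers added below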
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "additional_special_tokens": [
+     "<present>",
+     "<absent>",
+     "<category>",
+     "<infill>",
+     "<seealso>",
+     "<header>",
+     "<|endoftext|>",
+     "<sep>",
+     "<mask>",
+     "<mixed>",
+     "<number>",
+     "<phrase>"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": "<sep>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
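
As a rough illustration of what this map controls, the snippet below (reusing `tok` from the earlier sketch) checks that the extra markers map to single ids instead of being split by BPE, and shows the effect of `lstrip: true` on the mask token:

# Each special token should round-trip as one id, not a BPE split.
for t in ["<present>", "<absent>", "<sep>", "<mask>"]:
    print(t, "->", tok.convert_tokens_to_ids(t))

# mask_token has lstrip=true, so the space before <mask> is folded
# into the mask token rather than emitted as a separate "Ġ" piece.
print(tok.tokenize("keyphrases: <mask>"))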
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<present>",
+     "<absent>",
+     "<category>",
+     "<infill>",
+     "<seealso>",
+     "<header>",
+     "<|endoftext|>",
+     "<sep>",
+     "<mask>",
+     "<number>",
+     "<phrase>"
+   ],
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "name_or_path": "memray/bart_wikikp",
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep": "<sep>",
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "<sep>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": "/home/ashraf.haddad/.cache/huggingface/transformers/4b7a3619321a39f6780e0c775802e3523e52c1efd3a46e6d1baac9e1e8e234e6.898eb95aac9bb57440c2f57caa963ae18b9b10ba4731cc81020283286b0391fc",
+   "tokenizer_class": "BartTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
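
A short sketch of how the settings above behave at encode time; the input string is invented for illustration, and the comment on <sep> is an inference from the token names, not a documented format:

# model_max_length=512 caps inputs when truncation is requested.
enc = tok("a long input document " * 200, truncation=True)
print(len(enc["input_ids"]))     # <= 512

# sep_token comes from this config; <sep> presumably delimits generated
# keyphrases for the memray/bart_wikikp checkpoint it was copied from.
print(tok.sep_token, tok.sep_token_id)
print(tok.model_max_length)      # 512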
vocab.json ADDED
The diff for this file is too large to render. See raw diff
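
vocab.json (token string to id) and merges.txt (ranked merge rules) are the two halves of the byte-level BPE model and are too large to render here. A toy sketch of how a ranked merge list is applied; the three rules below are invented, not taken from this file, and real implementations repeatedly merge the best-ranked adjacent pair rather than making one pass per rule:

# Toy BPE: apply an invented, ranked merge list to the characters of "hello".
merges = [("h", "e"), ("l", "l"), ("he", "ll")]  # hypothetical rules
tokens = list("hello")
for a, b in merges:                  # higher-priority rules run first
    i = 0
    while i < len(tokens) - 1:
        if tokens[i] == a and tokens[i + 1] == b:
            tokens[i:i + 2] = [a + b]  # merge the adjacent pair in place
        else:
            i += 1
print(tokens)                        # ['hell', 'o']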