hieunguyen1053 committed
Commit
87274fe
1 Parent(s): eecfd03

Upload tokenizer

Files changed (4)
  1. README.md +4 -5
  2. special_tokens_map.json +1 -15
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +3 -18
README.md CHANGED
@@ -1,13 +1,12 @@
 ---
-license: apache-2.0
 datasets:
 - hieunguyen1053/binhvq-news-corpus
 language:
 - vi
 library_name: transformers
+license: apache-2.0
 pipeline_tag: fill-mask
-
 widget:
-- text: "tôi là <mask> viên trường đại học tôn đức thắng"
-  example_title: "Example 1"
----
+- text: tôi là <mask> viên trường đại học tôn đức thắng
+  example_title: Example 1
+---
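
Since the card declares `pipeline_tag: fill-mask`, the widget query can be reproduced locally. A minimal sketch, assuming the model's repository id (the placeholder below is hypothetical) and that the mask token is still configured after this commit:

```python
from transformers import pipeline

# Hypothetical repository id -- substitute the actual repo this commit belongs to.
MODEL_ID = "hieunguyen1053/your-model-id"

# "fill-mask" matches the pipeline_tag declared in the README front matter.
fill_mask = pipeline("fill-mask", model=MODEL_ID)

# The README widget text, roughly "I am a <mask> at Ton Duc Thang University".
for pred in fill_mask("tôi là <mask> viên trường đại học tôn đức thắng"):
    print(pred["token_str"], round(pred["score"], 4))
```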
special_tokens_map.json CHANGED
@@ -1,15 +1 @@
-{
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "mask_token": {
-    "content": "<mask>",
-    "lstrip": true,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "unk_token": "<unk>"
-}
+{}
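Collapsing `special_tokens_map.json` to `{}` is consistent with a fast tokenizer that serializes its added tokens inside `tokenizer.json`; whether role mappings like `mask_token` survive depends on what that file declares. A quick check after loading (repo id again a hypothetical placeholder):

```python
from transformers import AutoTokenizer

# Hypothetical repository id -- substitute the actual repo.
tokenizer = AutoTokenizer.from_pretrained("hieunguyen1053/your-model-id")

# Which special-token roles are still populated after this commit?
print(tokenizer.special_tokens_map)               # role -> token, may be empty now
print(tokenizer.all_special_tokens)               # tokens registered as special
print(tokenizer.convert_tokens_to_ids("<mask>"))  # vocab id of "<mask>", if present
```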
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,21 +1,6 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "mask_token": {
-    "__type": "AddedToken",
-    "content": "<mask>",
-    "lstrip": true,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "added_tokens_decoder": {},
+  "clean_up_tokenization_spaces": true,
   "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "",
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "sp_model_kwargs": {},
-  "special_tokens_map_file": null,
-  "tokenizer_class": "BartphoTokenizer",
-  "unk_token": "<unk>"
+  "tokenizer_class": "PreTrainedTokenizerFast"
 }
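
Switching `tokenizer_class` from `BartphoTokenizer` to `PreTrainedTokenizerFast` suggests the tokenizer now loads directly from the uploaded `tokenizer.json` rather than through the slow, SentencePiece-backed Bartpho implementation (note `sp_model_kwargs` is gone). A minimal loading sketch under that assumption, again with a hypothetical repo id:

```python
from transformers import AutoTokenizer

# Hypothetical repository id -- substitute the actual repo.
MODEL_ID = "hieunguyen1053/your-model-id"

# With tokenizer_class = PreTrainedTokenizerFast, AutoTokenizer loads the
# Rust-backed tokenizer serialized in tokenizer.json.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
print(tokenizer.is_fast)  # expected: True

# The huge model_max_length is the transformers sentinel for "no limit set",
# so pass an explicit max_length/truncation when encoding.
ids = tokenizer("tôi là sinh viên", truncation=True, max_length=256)["input_ids"]
print(ids)
```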