Training done
- added_tokens.json +27 -0
- preprocessor_config.json +26 -0
- special_tokens_map.json +8 -0
- tokenizer.json +0 -0
- tokenizer_config.json +18 -0
- vocab.txt +0 -0
added_tokens.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "</s>": 32000,
+  "<s_cord-v2>": 32024,
+  "D.": 32004,
+  "D.M.": 32006,
+  "VHOLA": 32022,
+  "কুষ্টিয়া": 32001,
+  "কূষ্টীয়া": 32021,
+  "খূলণা": 32014,
+  "গোপালগঞ্জ-ঘ": 32020,
+  "চুয়াডাংগা": 32007,
+  "চুয়াডাঙ্গা": 32023,
+  "জশর": 32017,
+  "জয়পুরহাট": 32012,
+  "ঝিনাইদাহ": 32015,
+  "টাঙাইল": 32016,
+  "ঢাকক": 32018,
+  "নারায়ণগঞ্জ": 32003,
+  "নড়াইল": 32019,
+  "ফেণী": 32005,
+  "বগুড়া": 32013,
+  "ভোলা-ট": 32011,
+  "ময়মনসিংহ": 32002,
+  "রাজবাড়ী": 32010,
+  "শরীয়তপুর": 32008,
+  "সাতক্ষিরা": 32009
+}
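added_tokens.json extends the base BanglaBERT vocabulary with a Donut task-start token (`<s_cord-v2>`) and a set of Bangla district-name tokens at ids 32000–32024. A minimal sketch of how the mapping could be verified after loading the tokenizer from this repo; the local path `./donut-bangla` is a placeholder for wherever these files are checked out:

```python
# Minimal sketch (assumes the repo files are in ./donut-bangla; path is a placeholder).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./donut-bangla")

# Entries from added_tokens.json should resolve to the ids recorded there.
print(tokenizer.convert_tokens_to_ids("</s>"))         # expected: 32000
print(tokenizer.convert_tokens_to_ids("<s_cord-v2>"))  # expected: 32024
print(tokenizer.convert_tokens_to_ids("কুষ্টিয়া"))        # expected: 32001
```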
preprocessor_config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "do_align_long_axis": false,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_thumbnail": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "DonutImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "DonutProcessor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": [
+    640,
+    320
+  ]
+}
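preprocessor_config.json configures a DonutImageProcessor: images are resized (with thumbnail and padding enabled) to a 640×320 canvas, rescaled by 1/255 (0.00392…), and normalized with per-channel mean and std of 0.5. A rough sketch of running that preprocessing through transformers; the local path and the sample image name are placeholders:

```python
# Minimal sketch (assumes the repo files are in ./donut-bangla; path and image are placeholders).
from PIL import Image
from transformers import DonutImageProcessor

image_processor = DonutImageProcessor.from_pretrained("./donut-bangla")

image = Image.open("sample_document.png").convert("RGB")  # hypothetical input image
inputs = image_processor(image, return_tensors="pt")

# With the legacy list form "size": [640, 320] read as [width, height],
# pixel_values should come out roughly as (batch, 3, 320, 640).
print(inputs["pixel_values"].shape)
```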
special_tokens_map.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "cls_token": "[CLS]",
+  "eos_token": "</s>",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "full_tokenizer_file": null,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "name_or_path": "csebuetnlp/banglabert_large",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "processor_class": "DonutProcessor",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": false,
+  "tokenizer_class": "ElectraTokenizer",
+  "unk_token": "[UNK]"
+}
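tokenizer_config.json wires the csebuetnlp/banglabert_large WordPiece tokenizer (ElectraTokenizer) into the Donut pipeline via "processor_class": "DonutProcessor", with the [CLS]/[SEP]/[PAD]/[MASK]/[UNK] special tokens from special_tokens_map.json and </s> as the EOS token. A sketch of loading everything together through DonutProcessor, which bundles this tokenizer with the image processor above; the path is again a placeholder:

```python
# Minimal sketch (assumes the repo files are in ./donut-bangla; path is a placeholder).
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("./donut-bangla")

# The wrapped tokenizer carries the special tokens from special_tokens_map.json.
print(type(processor.tokenizer).__name__)  # ElectraTokenizer (or its fast variant)
print(processor.tokenizer.eos_token)       # "</s>"
print(processor.tokenizer.pad_token)       # "[PAD]"

# Decoder prompts for generation would typically start from the task token added above.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids
print(decoder_input_ids)  # expected to contain id 32024
```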
vocab.txt
ADDED
The diff for this file is too large to render.