Add SetFit model

- 1_Pooling/config.json +1 -1
- README.md +62 -62
- config.json +12 -15
- config_sentence_transformers.json +2 -2
- config_setfit.json +2 -2
- model.safetensors +2 -2
- model_head.pkl +2 -2
- modules.json +6 -0
- sentence_bert_config.json +1 -1
- special_tokens_map.json +6 -20
- tokenizer.json +2 -2
- tokenizer_config.json +23 -20
- vocab.txt +0 -0
1_Pooling/config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-    "word_embedding_dimension": 768,
+    "word_embedding_dimension": 384,
     "pooling_mode_cls_token": false,
     "pooling_mode_mean_tokens": true,
     "pooling_mode_max_tokens": false,
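Aside: with `pooling_mode_mean_tokens` enabled, the 384-dimensional token embeddings from the transformer body are averaged into a single sentence vector, with padding positions masked out. A minimal PyTorch sketch of that operation (the names are illustrative, not library API):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings, ignoring padding positions.

    token_embeddings: (batch, seq_len, 384) output of the transformer body
    attention_mask:   (batch, seq_len) with 1 for real tokens, 0 for padding
    """
    mask = attention_mask.unsqueeze(-1).float()    # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)  # zero out padding, sum over tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)       # number of real tokens per sentence
    return summed / counts                         # (batch, 384) sentence embeddings

# toy check: two sentences, the second padded to length 4
emb = torch.randn(2, 4, 384)
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])
print(mean_pool(emb, mask).shape)  # torch.Size([2, 384])
```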
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-base_model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
+base_model: sentence-transformers/all-MiniLM-L6-v2
 library_name: setfit
 metrics:
-- f1
+- accuracy
 pipeline_tag: text-classification
 tags:
 - setfit
@@ -10,41 +10,27 @@ tags:
 - text-classification
 - generated_from_setfit_trainer
 widget:
-- text: …
-- text: …
+- text: Since the early morning of March 21st, 2021, different armed actions have
+    taken place in Venezuela in the state of Apure (Venezuela) that borders the department
+    of Arauca (Colombia).
+- text: 3 Clear, responsive and inclusive communication and open channels for raising
+    and addressing concerns is paramount at this time, consistent with accountability
+    to affected populations and Age-Gender and Diversity Principles, as is effective
+    and timely tracking and response to rumours.
+- text: Selon ces PDIs, des parents restés ou retournés au village les auraient informées
+    de l’amélioration de la situation sécuritaire.
+- text: Market supply has been impacted by significant deterioration of agricultural
+    service roads due to rainfall erosion, limiting any production delivery to consumption
+    centers.
+- text: Prevention of moderate acute malnutrition activities among children and PLW
+    of households vulnerable to food insecurity during the lean season are also underway
+    and WFP assisted a total of 17,471 children aged 6-23 months and 14,015 PLW.
 inference: true
-model-index:
-- name: SetFit with sentence-transformers/paraphrase-multilingual-mpnet-base-v2
-  results:
-  - task:
-      type: text-classification
-      name: Text Classification
-    dataset:
-      name: Unknown
-      type: unknown
-      split: test
-    metrics:
-    - type: f1
-      value: 0.7804878048780488
-      name: F1
 ---
 
-# SetFit with sentence-transformers/paraphrase-multilingual-mpnet-base-v2
+# SetFit with sentence-transformers/all-MiniLM-L6-v2
 
-This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [sentence-transformers/paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) as the Sentence Transformer embedding model. A [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance is used for classification.
+This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) as the Sentence Transformer embedding model. A [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance is used for classification.
 
 The model has been trained using an efficient few-shot learning technique that involves:
 
@@ -55,9 +41,9 @@ The model has been trained using an efficient few-shot learning technique that i
 
 ### Model Description
 - **Model Type:** SetFit
-- **Sentence Transformer body:** [sentence-transformers/paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)
+- **Sentence Transformer body:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
 - **Classification head:** a [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance
-- **Maximum Sequence Length:** 128 tokens
+- **Maximum Sequence Length:** 256 tokens
 - **Number of Classes:** 2 classes
 <!-- - **Training Dataset:** [Unknown](https://huggingface.co/datasets/unknown) -->
 <!-- - **Language:** Unknown -->
@@ -70,17 +56,10 @@ The model has been trained using an efficient few-shot learning technique that i
 - **Blogpost:** [SetFit: Efficient Few-Shot Learning Without Prompts](https://huggingface.co/blog/setfit)
 
 ### Model Labels
-| Label | Examples |
-|:------|:---------|
-| 1 | <ul><li>'…'</li></ul> |
-| 0 | <ul><li>'…'</li></ul> |
-
-## Evaluation
-
-### Metrics
-| Label | F1 |
-|:--------|:-------|
-| **all** | 0.7805 |
+| Label | Examples |
+|:------|:---------|
+| 1 | <ul><li>'In January, as part of its advocacy for the protection of civilians and human rights, the United Nations Joint Human Rights Office in the Democratic Republic of the Congo issued two public reports highlighting the upward trend in human rights violations and abuses committed in Ituri and North Kivu by armed groups, as well as by members of the national security and defence forces.'</li><li>'A son indépendance, en 1960, la RDC avait un PIB par habitant de 325 USD et était la deuxième économie la plus industrialisée d’Afrique, après l’Afrique du Sud.'</li><li>"Les populations les plus gravement touchées sont celles qui ont été déplacées, les groupes de réfugiés et de populations rentrées chez elles, les familles d'accueil et les populations victimes de catastrophes naturelles (inondations, glissements de terrain, incendies) ainsi que les ménages dont le chef de famille est une femme."</li></ul> |
+| 0 | <ul><li>'This may be driven by children’s varying levels of education and their different language skills,'</li><li>'Ce sont des travaux très pénibles qui nuisent à leur santé physique.'</li><li>'Screening and treatment of MAM were enabled for 10,184 children aged 6-59 months and 2,613 PLW.'</li></ul> |
 
 ## Uses
 
@@ -98,9 +77,9 @@ Then you can load this model and run inference.
 from setfit import SetFitModel
 
 # Download from the 🤗 Hub
-model = SetFitModel.from_pretrained("…")
+model = SetFitModel.from_pretrained("setfit_model_id")
 # Run inference
-preds = model("Ils auraient menacé la femme d’un VDP et fouiller leur avant d…")
+preds = model("Selon ces PDIs, des parents restés ou retournés au village les auraient informées de l’amélioration de la situation sécuritaire.")
 ```
 
 <!--
@@ -132,12 +111,12 @@ preds = model("Ils auraient menacé la femme d’un VDP et fouiller leur avant d
 ### Training Set Metrics
 | Training set | Min | Median | Max |
 |:-------------|:----|:--------|:----|
-| Word count | … | … | … |
+| Word count | 1 | 25.2763 | 95 |
 
 | Label | Training Sample Count |
 |:------|:----------------------|
-| 0 | … |
-| 1 | … |
+| 0 | 295 |
+| 1 | 313 |
 
 ### Training Hyperparameters
 - batch_size: (32, 32)
@@ -161,21 +140,42 @@ preds = model("Ils auraient menacé la femme d’un VDP et fouiller leur avant d
 ### Training Results
 | Epoch | Step | Training Loss | Validation Loss |
 |:------:|:----:|:-------------:|:---------------:|
-| 0.… | … | … | - |
-| 0.… | … | … | - |
-| 0.… | … | … | - |
-| 0.… | … | … | - |
-| 0.… | … | … | - |
-| 0.… | … | … | - |
+| 0.0008 | 1 | 0.4533 | - |
+| 0.0376 | 50 | 0.3371 | - |
+| 0.0752 | 100 | 0.2585 | - |
+| 0.1128 | 150 | 0.2574 | - |
+| 0.1504 | 200 | 0.2535 | - |
+| 0.1880 | 250 | 0.2513 | - |
+| 0.2256 | 300 | 0.2573 | - |
+| 0.2632 | 350 | 0.246 | - |
+| 0.3008 | 400 | 0.2471 | - |
+| 0.3383 | 450 | 0.247 | - |
+| 0.3759 | 500 | 0.2348 | - |
+| 0.4135 | 550 | 0.2165 | - |
+| 0.4511 | 600 | 0.1911 | - |
+| 0.4887 | 650 | 0.1402 | - |
+| 0.5263 | 700 | 0.0865 | - |
+| 0.5639 | 750 | 0.049 | - |
+| 0.6015 | 800 | 0.0279 | - |
+| 0.6391 | 850 | 0.0188 | - |
+| 0.6767 | 900 | 0.0108 | - |
+| 0.7143 | 950 | 0.0072 | - |
+| 0.7519 | 1000 | 0.0051 | - |
+| 0.7895 | 1050 | 0.0039 | - |
+| 0.8271 | 1100 | 0.0032 | - |
+| 0.8647 | 1150 | 0.0039 | - |
+| 0.9023 | 1200 | 0.0025 | - |
+| 0.9398 | 1250 | 0.0024 | - |
+| 0.9774 | 1300 | 0.0023 | - |
 
 ### Framework Versions
-- Python: 3.…
+- Python: 3.11.5
 - SetFit: 1.1.0
 - Sentence Transformers: 3.1.1
-- Transformers: 4.…
-- PyTorch: 2.…
-- Datasets: …
-- Tokenizers: 0.…
+- Transformers: 4.45.1
+- PyTorch: 2.1.0
+- Datasets: 2.17.1
+- Tokenizers: 0.20.0
 
 ## Citation
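A note on the card's usage snippet: `"setfit_model_id"` is the card generator's placeholder for the real Hub repo id, and beyond hard labels the SetFit API also exposes class probabilities from the logistic-regression head. A minimal sketch, reusing two of the card's own widget examples:

```python
from setfit import SetFitModel

# "setfit_model_id" is the card's placeholder; substitute the actual Hub repo id
model = SetFitModel.from_pretrained("setfit_model_id")

texts = [
    "Selon ces PDIs, des parents restés ou retournés au village les auraient informées de l’amélioration de la situation sécuritaire.",
    "Screening and treatment of MAM were enabled for 10,184 children aged 6-59 months and 2,613 PLW.",
]

preds = model.predict(texts)        # hard labels (0 or 1 for this model)
probs = model.predict_proba(texts)  # per-class probabilities from the LogisticRegression head
print(preds, probs)
```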
config.json
CHANGED
@@ -1,29 +1,26 @@
 {
-  "_name_or_path": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
   "architectures": [
-    "XLMRobertaModel"
+    "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
   "classifier_dropout": null,
-  "eos_token_id": 2,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
+  "hidden_size": 384,
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
+  "intermediate_size": 1536,
-  "layer_norm_eps": 1e-05,
+  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 514,
+  "max_position_embeddings": 512,
-  "model_type": "xlm-roberta",
+  "model_type": "bert",
   "num_attention_heads": 12,
-  "num_hidden_layers": 12,
+  "num_hidden_layers": 6,
-  "output_past": true,
-  "pad_token_id": 1,
+  "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.…",
+  "transformers_version": "4.45.1",
-  "type_vocab_size": 1,
+  "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size": 250002
+  "vocab_size": 30522
 }
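Aside: the new values can be sanity-checked against the upstream checkpoint with `transformers` (a sketch; it fetches the config from the Hub):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# These match the new config.json above: a 6-layer BERT with 384-d hidden states
print(cfg.model_type, cfg.num_hidden_layers, cfg.hidden_size, cfg.vocab_size)
# bert 6 384 30522
```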
config_sentence_transformers.json
CHANGED
@@ -1,8 +1,8 @@
 {
   "__version__": {
     "sentence_transformers": "3.1.1",
-    "transformers": "4.…",
-    "pytorch": "2.…"
+    "transformers": "4.45.1",
+    "pytorch": "2.1.0"
   },
   "prompts": {},
   "default_prompt_name": null,
config_setfit.json
CHANGED
@@ -1,4 +1,4 @@
 {
-    "…"
-    "…"
+    "normalize_embeddings": false,
+    "labels": null
 }
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:65f71a8359b6303e5d02940c400afd85298b5e4f99c9cb9b263cd1b80a911138
+size 90864192
model_head.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:ef4b4c7c2956787b60c529eb67e22b32ad749e7ca5673d4abf66dd0d0204a62a
+size 3935
modules.json
CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
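Aside: the added `2_Normalize` entry L2-normalizes the pooled sentence vector. A sketch of the equivalent three-module stack assembled by hand with `sentence-transformers` (loading this repo directly with `SentenceTransformer` reads `modules.json` and builds the same pipeline):

```python
from sentence_transformers import SentenceTransformer, models

# 0_Transformer: the all-MiniLM-L6-v2 body, truncating inputs at 256 tokens
word = models.Transformer("sentence-transformers/all-MiniLM-L6-v2", max_seq_length=256)
# 1_Pooling: mean pooling over token embeddings (384-d, per 1_Pooling/config.json)
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")
# 2_Normalize: the module added in this commit; L2-normalizes the sentence vector
norm = models.Normalize()

encoder = SentenceTransformer(modules=[word, pool, norm])
vec = encoder.encode(["a short test sentence"])
print(vec.shape)  # (1, 384), rows have unit L2 norm
```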
sentence_bert_config.json
CHANGED
@@ -1,4 +1,4 @@
 {
-    "max_seq_length": 128,
+    "max_seq_length": 256,
     "do_lower_case": false
 }
special_tokens_map.json
CHANGED
@@ -1,48 +1,34 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "<s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "eos_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "mask_token": {
-    "content": "<mask>",
-    "lstrip": true,
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<pad>",
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "<unk>",
+    "content": "[UNK]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:851ca67100d372ca3ae031a6abd168f53489eebfd7d89523f35c5c9b4d372c3c
+size 711649
tokenizer_config.json
CHANGED
@@ -1,61 +1,64 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<s>",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1": {
-      "content": "<pad>",
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "2": {
-      "content": "</s>",
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "3": {
-      "content": "<unk>",
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "250001": {
-      "content": "<mask>",
-      "lstrip": true,
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
   "max_length": 128,
-  "model_max_length": 128,
+  "model_max_length": 256,
+  "never_split": null,
   "pad_to_multiple_of": null,
-  "pad_token": "<pad>",
+  "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
-  "sep_token": "</s>",
+  "sep_token": "[SEP]",
   "stride": 0,
-  "tokenizer_class": "XLMRobertaTokenizer",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
-  "unk_token": "<unk>"
+  "unk_token": "[UNK]"
 }
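Aside: the switch to a lowercasing `BertTokenizer` can be checked with the upstream `all-MiniLM-L6-v2` tokenizer, from which these files are derived (a sketch; the outputs in comments are indicative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# BERT-style special tokens, as in the new special_tokens_map.json
print(tok.cls_token, tok.sep_token, tok.pad_token)  # [CLS] [SEP] [PAD]

# Inputs are lowercased (do_lower_case: true) and truncated at the configured limit
enc = tok("Screening and treatment of MAM were enabled.", truncation=True, max_length=256)
print(tok.convert_ids_to_tokens(enc["input_ids"])[:3])  # e.g. ['[CLS]', 'screening', 'and']
```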
vocab.txt
ADDED
The diff for this file is too large to render. See the raw diff.