Tanor committed on
Commit
f6c4c23
·
verified ·
1 Parent(s): 6bc5972

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sr_ner_tesla_j125-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
37
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - spacy
4
+ - token-classification
5
+ language:
6
+ - sr
7
+ license: cc-by-sa-3.0
8
+ model-index:
9
+ - name: sr_ner_tesla_j125
10
+ results:
11
+ - task:
12
+ name: NER
13
+ type: token-classification
14
+ metrics:
15
+ - name: NER Precision
16
+ type: precision
17
+ value: 0.9490420168
18
+ - name: NER Recall
19
+ type: recall
20
+ value: 0.9550128535
21
+ - name: NER F Score
22
+ type: f_score
23
+ value: 0.9520180733
24
+ ---
25
+ sr_ner_tesla_j125 is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on XLM-R-BERTić, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.
26
+
27
+ | Feature | Description |
28
+ | --- | --- |
29
+ | **Name** | `sr_ner_tesla_j125` |
30
+ | **Version** | `1.0.0` |
31
+ | **spaCy** | `>=3.7.2,<3.8.0` |
32
+ | **Default Pipeline** | `transformer`, `ner` |
33
+ | **Components** | `transformer`, `ner` |
34
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
35
+ | **Sources** | n/a |
36
+ | **License** | `CC BY-SA 3.0` |
37
+ | **Author** | [Milica Ikonić Nešić, Saša Petalinkar, Mihailo Škorić, Ranka Stanković](https://tesla.rgf.bg.ac.rs/) |
38
+
39
+ ### Label Scheme
40
+
41
+ <details>
42
+
43
+ <summary>View label scheme (7 labels for 1 component)</summary>
44
+
45
+ | Component | Labels |
46
+ | --- | --- |
47
+ | **`ner`** | `DEMO`, `EVENT`, `LOC`, `ORG`, `PERS`, `ROLE`, `WORK` |
48
+
49
+ </details>
50
+
51
+ ### Accuracy
52
+
53
+ | Type | Score |
54
+ | --- | --- |
55
+ | `ENTS_F` | 95.20 |
56
+ | `ENTS_P` | 94.90 |
57
+ | `ENTS_R` | 95.50 |
58
+ | `TRANSFORMER_LOSS` | 1595.77 |
59
+ | `NER_LOSS` | 1692.02 |
config.cfg ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [paths]
2
+ train = "./train.spacy"
3
+ dev = "./dev.spacy"
4
+ vectors = null
5
+ bert = "E:\\scratch2lm\\bert modeli\\jerteh-125"
6
+ init_tok2vec = null
7
+
8
+ [system]
9
+ gpu_allocator = "pytorch"
10
+ seed = 0
11
+
12
+ [nlp]
13
+ lang = "sr"
14
+ pipeline = ["transformer","ner"]
15
+ batch_size = 128
16
+ disabled = []
17
+ before_creation = null
18
+ after_creation = null
19
+ after_pipeline_creation = null
20
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
21
+ vectors = {"@vectors":"spacy.Vectors.v1"}
22
+
23
+ [components]
24
+
25
+ [components.ner]
26
+ factory = "ner"
27
+ incorrect_spans_key = null
28
+ moves = null
29
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
30
+ update_with_oracle_cut_size = 100
31
+
32
+ [components.ner.model]
33
+ @architectures = "spacy.TransitionBasedParser.v2"
34
+ state_type = "ner"
35
+ extra_state_tokens = false
36
+ hidden_width = 64
37
+ maxout_pieces = 2
38
+ use_upper = false
39
+ nO = null
40
+
41
+ [components.ner.model.tok2vec]
42
+ @architectures = "spacy-transformers.TransformerListener.v1"
43
+ grad_factor = 1.0
44
+ pooling = {"@layers":"reduce_mean.v1"}
45
+ upstream = "*"
46
+
47
+ [components.transformer]
48
+ factory = "transformer"
49
+ max_batch_items = 4096
50
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
51
+
52
+ [components.transformer.model]
53
+ @architectures = "spacy-transformers.TransformerModel.v3"
54
+ name = ${paths.bert}
55
+ mixed_precision = false
56
+
57
+ [components.transformer.model.get_spans]
58
+ @span_getters = "spacy-transformers.strided_spans.v1"
59
+ window = 128
60
+ stride = 96
61
+
62
+ [components.transformer.model.grad_scaler_config]
63
+
64
+ [components.transformer.model.tokenizer_config]
65
+ use_fast = true
66
+
67
+ [components.transformer.model.transformer_config]
68
+
69
+ [corpora]
70
+
71
+ [corpora.dev]
72
+ @readers = "spacy.Corpus.v1"
73
+ path = ${paths.dev}
74
+ max_length = 0
75
+ gold_preproc = false
76
+ limit = 0
77
+ augmenter = null
78
+
79
+ [corpora.train]
80
+ @readers = "spacy.Corpus.v1"
81
+ path = ${paths.train}
82
+ max_length = 0
83
+ gold_preproc = false
84
+ limit = 0
85
+ augmenter = null
86
+
87
+ [training]
88
+ accumulate_gradient = 3
89
+ dev_corpus = "corpora.dev"
90
+ train_corpus = "corpora.train"
91
+ seed = ${system.seed}
92
+ gpu_allocator = ${system.gpu_allocator}
93
+ dropout = 0.1
94
+ patience = 1600
95
+ max_epochs = 0
96
+ max_steps = 20000
97
+ eval_frequency = 200
98
+ frozen_components = []
99
+ annotating_components = []
100
+ before_to_disk = null
101
+ before_update = null
102
+
103
+ [training.batcher]
104
+ @batchers = "spacy.batch_by_padded.v1"
105
+ discard_oversize = true
106
+ size = 2000
107
+ buffer = 256
108
+ get_length = null
109
+
110
+ [training.logger]
111
+ @loggers = "spacy.ConsoleLogger.v1"
112
+ progress_bar = false
113
+
114
+ [training.optimizer]
115
+ @optimizers = "Adam.v1"
116
+ beta1 = 0.9
117
+ beta2 = 0.999
118
+ L2_is_weight_decay = true
119
+ L2 = 0.01
120
+ grad_clip = 1.0
121
+ use_averages = false
122
+ eps = 0.00000001
123
+
124
+ [training.optimizer.learn_rate]
125
+ @schedules = "warmup_linear.v1"
126
+ warmup_steps = 250
127
+ total_steps = 20000
128
+ initial_rate = 0.00005
129
+
130
+ [training.score_weights]
131
+ ents_f = 1.0
132
+ ents_p = 0.0
133
+ ents_r = 0.0
134
+ ents_per_type = null
135
+
136
+ [pretraining]
137
+
138
+ [initialize]
139
+ vectors = ${paths.vectors}
140
+ init_tok2vec = ${paths.init_tok2vec}
141
+ vocab_data = null
142
+ lookups = null
143
+ before_init = null
144
+ after_init = null
145
+
146
+ [initialize.components]
147
+
148
+ [initialize.tokenizer]
meta.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lang":"sr",
3
+ "name":"ner_tesla_j125",
4
+ "version":"1.0.0",
5
+ "description":"sr_ner_tesla_j125 is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on XLM-R-BERTi\u0107, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.",
6
+ "author":"Milica Ikoni\u0107 Ne\u0161i\u0107, Sa\u0161a Petalinkar, Mihailo \u0160kori\u0107, Ranka Stankovi\u0107",
7
+ "email":"",
8
+ "url":"https://tesla.rgf.bg.ac.rs/",
9
+ "license":"CC BY-SA 3.0",
10
+ "spacy_version":">=3.7.2,<3.8.0",
11
+ "spacy_git_version":"a89eae928",
12
+ "vectors":{
13
+ "width":0,
14
+ "vectors":0,
15
+ "keys":0,
16
+ "name":null
17
+ },
18
+ "labels":{
19
+ "transformer":[
20
+
21
+ ],
22
+ "ner":[
23
+ "DEMO",
24
+ "EVENT",
25
+ "LOC",
26
+ "ORG",
27
+ "PERS",
28
+ "ROLE",
29
+ "WORK"
30
+ ]
31
+ },
32
+ "pipeline":[
33
+ "transformer",
34
+ "ner"
35
+ ],
36
+ "components":[
37
+ "transformer",
38
+ "ner"
39
+ ],
40
+ "disabled":[
41
+
42
+ ],
43
+ "performance":{
44
+ "ents_f":0.9520180733,
45
+ "ents_p":0.9490420168,
46
+ "ents_r":0.9550128535,
47
+ "ents_per_type":{
48
+ "ROLE":{
49
+ "p":0.8601667736,
50
+ "r":0.8804990151,
51
+ "f":0.8702141467
52
+ },
53
+ "PERS":{
54
+ "p":0.9840617607,
55
+ "r":0.988492808,
56
+ "f":0.9862723075
57
+ },
58
+ "LOC":{
59
+ "p":0.9579414838,
60
+ "r":0.9665260938,
61
+ "f":0.9622146418
62
+ },
63
+ "DEMO":{
64
+ "p":0.9058641975,
65
+ "r":0.9244094488,
66
+ "f":0.9150428683
67
+ },
68
+ "ORG":{
69
+ "p":0.773925104,
70
+ "r":0.762295082,
71
+ "f":0.7680660702
72
+ },
73
+ "WORK":{
74
+ "p":0.6724137931,
75
+ "r":0.5492957746,
76
+ "f":0.6046511628
77
+ },
78
+ "EVENT":{
79
+ "p":0.7333333333,
80
+ "r":0.6875,
81
+ "f":0.7096774194
82
+ }
83
+ },
84
+ "transformer_loss":1595.7678469561,
85
+ "ner_loss":1692.0175977019
86
+ },
87
+ "requirements":[
88
+ "spacy-transformers>=1.3.4,<1.4.0"
89
+ ]
90
+ }
ner/cfg ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "moves":null,
3
+ "update_with_oracle_cut_size":100,
4
+ "multitasks":[
5
+
6
+ ],
7
+ "min_action_freq":1,
8
+ "learn_tokens":false,
9
+ "beam_width":1,
10
+ "beam_density":0.0,
11
+ "beam_update_prob":0.0,
12
+ "incorrect_spans_key":null
13
+ }
ner/model ADDED
Binary file (245 kB). View file
 
ner/moves ADDED
@@ -0,0 +1 @@
 
 
1
+ ��moves��{"0":{},"1":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"2":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"3":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"4":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546,"":1},"5":{"":1}}�cfg��neg_key�
sr_ner_tesla_j125-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83ca069717055fcb80436282022493a886ff34a160b6f601347ee787fb05e15a
3
+ size 290393121
tokenizer ADDED
Binary file (32.6 kB). View file
 
transformer/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "max_batch_items":4096
3
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5afa9d9580be6135cb7c9b1003467fc859eefb7fda8d1a948afe642ed0148b38
3
+ size 328605096
vocab/key2row ADDED
@@ -0,0 +1 @@
 
 
1
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/vectors ADDED
Binary file (128 Bytes). View file
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }