Tanor committed on
Commit
1c0297a
1 Parent(s): 4d0a4b1

Update spaCy pipeline

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sr_ner_tesla_j355-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ tags:
+ - spacy
+ - token-classification
+ language:
+ - sr
+ license: cc-by-sa-3.0
+ model-index:
+ - name: sr_ner_tesla_j355
+   results:
+   - task:
+       name: NER
+       type: token-classification
+     metrics:
+     - name: NER Precision
+       type: precision
+       value: 0.9505642128
+     - name: NER Recall
+       type: recall
+       value: 0.957380598
+     - name: NER F Score
+       type: f_score
+       value: 0.9539602292
+ ---
+ sr_ner_tesla_j355 is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on Jerteh-355, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.
+
+ | Feature | Description |
+ | --- | --- |
+ | **Name** | `sr_ner_tesla_j355` |
+ | **Version** | `1.0.0` |
+ | **spaCy** | `>=3.7.2,<3.8.0` |
+ | **Default Pipeline** | `transformer`, `ner` |
+ | **Components** | `transformer`, `ner` |
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+ | **Sources** | n/a |
+ | **License** | `CC BY-SA 3.0` |
+ | **Author** | [Milica Ikonić Nešić, Saša Petalinkar, Mihailo Škorić, Ranka Stanković](https://tesla.rgf.bg.ac.rs/) |
+
+ ### Label Scheme
+
+ <details>
+
+ <summary>View label scheme (7 labels for 1 component)</summary>
+
+ | Component | Labels |
+ | --- | --- |
+ | **`ner`** | `DEMO`, `EVENT`, `LOC`, `ORG`, `PERS`, `ROLE`, `WORK` |
+
+ </details>
+
+ ### Accuracy
+
+ | Type | Score |
+ | --- | --- |
+ | `ENTS_F` | 95.40 |
+ | `ENTS_P` | 95.06 |
+ | `ENTS_R` | 95.74 |
+ | `TRANSFORMER_LOSS` | 137907.16 |
+ | `NER_LOSS` | 265590.02 |
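
For orientation, here is a minimal usage sketch for the pipeline documented above. It assumes the package has been installed from the wheel added in this commit (e.g. `pip install sr_ner_tesla_j355-any-py3-none-any.whl`) together with `spacy` and `spacy-transformers`; the example sentence and printout are illustrative and not part of the committed files.

```python
import spacy

# Load the installed pipeline package (name assumed to match the pipeline name).
nlp = spacy.load("sr_ner_tesla_j355")

# Illustrative Serbian sentence; any text works.
doc = nlp("Никола Тесла је рођен у Смиљану, а радио је у Њујорку.")

# Each recognized span carries one of the 7 labels from the label scheme above.
for ent in doc.ents:
    print(ent.text, ent.label_)
```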
config.cfg ADDED
@@ -0,0 +1,148 @@
+ [paths]
+ train = "./train.spacy"
+ dev = "./dev.spacy"
+ vectors = null
+ bert = "E:\\scratch2lm\\bert modeli\\jerteh-355"
+ init_tok2vec = null
+
+ [system]
+ gpu_allocator = "pytorch"
+ seed = 0
+
+ [nlp]
+ lang = "sr"
+ pipeline = ["transformer","ner"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+ vectors = {"@vectors":"spacy.Vectors.v1"}
+
+ [components]
+
+ [components.ner]
+ factory = "ner"
+ incorrect_spans_key = null
+ moves = null
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
+ update_with_oracle_cut_size = 100
+
+ [components.ner.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "ner"
+ extra_state_tokens = false
+ hidden_width = 64
+ maxout_pieces = 2
+ use_upper = false
+ nO = null
+
+ [components.ner.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ grad_factor = 1.0
+ pooling = {"@layers":"reduce_mean.v1"}
+ upstream = "*"
+
+ [components.transformer]
+ factory = "transformer"
+ max_batch_items = 4096
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+ [components.transformer.model]
+ @architectures = "spacy-transformers.TransformerModel.v3"
+ name = ${paths.bert}
+ mixed_precision = false
+
+ [components.transformer.model.get_spans]
+ @span_getters = "spacy-transformers.strided_spans.v1"
+ window = 128
+ stride = 96
+
+ [components.transformer.model.grad_scaler_config]
+
+ [components.transformer.model.tokenizer_config]
+ use_fast = true
+
+ [components.transformer.model.transformer_config]
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [training]
+ accumulate_gradient = 3
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ patience = 1600
+ max_epochs = 0
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ annotating_components = []
+ before_to_disk = null
+ before_update = null
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_padded.v1"
+ discard_oversize = true
+ size = 2000
+ buffer = 256
+ get_length = null
+
+ [training.logger]
+ @loggers = "spacy.ConsoleLogger.v1"
+ progress_bar = false
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.optimizer.learn_rate]
+ @schedules = "warmup_linear.v1"
+ warmup_steps = 250
+ total_steps = 20000
+ initial_rate = 0.00005
+
+ [training.score_weights]
+ ents_f = 1.0
+ ents_p = 0.0
+ ents_r = 0.0
+ ents_per_type = null
+
+ [pretraining]
+
+ [initialize]
+ vectors = ${paths.vectors}
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
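
A brief sketch of reusing this config for retraining. Note that `paths.bert` points to a local Windows directory from the original training run, so it must be overridden on any other machine; the Hugging Face model ID `jerteh/Jerteh-355` below is an assumed placeholder for wherever the Jerteh-355 base model is published, and loading via `spacy.util.load_config` is just one option alongside the `spacy train` CLI.

```python
from spacy.util import load_config

# Load the committed training config, swapping the local-only base-model path
# ("E:\\scratch2lm\\bert modeli\\jerteh-355") for a published checkpoint.
# The ID below is a placeholder assumption, not taken from this commit.
config = load_config(
    "config.cfg",
    overrides={"paths.bert": "jerteh/Jerteh-355"},
    interpolate=True,
)

# With interpolation, ${paths.bert} resolves into the transformer model name.
print(config["components"]["transformer"]["model"]["name"])
```

The equivalent CLI override would be `python -m spacy train config.cfg --paths.bert <base model>`.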
meta.json ADDED
@@ -0,0 +1,90 @@
+ {
+   "lang":"sr",
+   "name":"ner_tesla_j355",
+   "version":"1.0.0",
+   "description":"sr_ner_tesla_j355 is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on Jerteh-355, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.",
+   "author":"Milica Ikoni\u0107 Ne\u0161i\u0107, Sa\u0161a Petalinkar, Mihailo \u0160kori\u0107, Ranka Stankovi\u0107",
+   "email":"",
+   "url":"https://tesla.rgf.bg.ac.rs/",
+   "license":"CC BY-SA 3.0",
+   "spacy_version":">=3.7.2,<3.8.0",
+   "spacy_git_version":"a89eae928",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null
+   },
+   "labels":{
+     "transformer":[
+
+     ],
+     "ner":[
+       "DEMO",
+       "EVENT",
+       "LOC",
+       "ORG",
+       "PERS",
+       "ROLE",
+       "WORK"
+     ]
+   },
+   "pipeline":[
+     "transformer",
+     "ner"
+   ],
+   "components":[
+     "transformer",
+     "ner"
+   ],
+   "disabled":[
+
+   ],
+   "performance":{
+     "ents_f":0.9539602292,
+     "ents_p":0.9505642128,
+     "ents_r":0.957380598,
+     "ents_per_type":{
+       "ROLE":{
+         "p":0.8676470588,
+         "r":0.8910045962,
+         "f":0.8791707159
+       },
+       "PERS":{
+         "p":0.9847861329,
+         "r":0.987742339,
+         "f":0.9862620207
+       },
+       "LOC":{
+         "p":0.9486266531,
+         "r":0.9831312599,
+         "f":0.9655707999
+       },
+       "DEMO":{
+         "p":0.8775510204,
+         "r":0.9480314961,
+         "f":0.9114307343
+       },
+       "ORG":{
+         "p":0.8499184339,
+         "r":0.7117486339,
+         "f":0.7747211896
+       },
+       "WORK":{
+         "p":0.5681818182,
+         "r":0.3521126761,
+         "f":0.4347826087
+       },
+       "EVENT":{
+         "p":0.6666666667,
+         "r":0.625,
+         "f":0.6451612903
+       }
+     },
+     "transformer_loss":1379.0715721975,
+     "ner_loss":2655.900216832
+   },
+   "requirements":[
+     "spacy-transformers>=1.3.4,<1.4.0"
+   ]
+ }
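
The per-entity scores recorded above (strong PERS and LOC, weaker WORK and EVENT) ship inside the installed pipeline as well, so they can be checked at runtime without opening meta.json. A minimal sketch, assuming the pipeline is installed as in the README example:

```python
import spacy

nlp = spacy.load("sr_ner_tesla_j355")

# nlp.meta mirrors meta.json; pull the per-label precision/recall/F recorded above.
per_type = nlp.meta["performance"]["ents_per_type"]
for label, scores in sorted(per_type.items()):
    print(f"{label:6s} P={scores['p']:.3f} R={scores['r']:.3f} F={scores['f']:.3f}")
```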
ner/cfg ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[
+
+   ],
+   "min_action_freq":1,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
ner/model ADDED
Binary file (310 kB).
 
ner/moves ADDED
@@ -0,0 +1 @@
+ ��moves��{"0":{},"1":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"2":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"3":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"4":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546,"":1},"5":{"":1}}�cfg��neg_key�
sr_ner_tesla_j355-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afa9686ab60f4cf2195c6086f1dff37ed328569665d3d13d04294c0a4b5115fe
+ size 1308968653
tokenizer ADDED
Binary file (32.6 kB).
 
transformer/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "max_batch_items":4096
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2dcb4a64f6bf1d3dedfb96eb233545c4550265f36d9faecc005bf1fef34974a
+ size 1420537659
vocab/key2row ADDED
@@ -0,0 +1 @@
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render.
 
vocab/vectors ADDED
Binary file (128 Bytes).
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "mode":"default"
+ }