madriss Danube committed on
Commit
9b0ae01
1 Parent(s): 975bfdf

Upload 10 files (#3)

Browse files

- Upload 10 files (fe8f669d4893191b2983f0c21750a83edfaafdaa)


Co-authored-by: Dany <[email protected]>

audio/ner/camember-ner/.gitattributes ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
17
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
18
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
audio/ner/camember-ner/README.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: fr
3
+ datasets:
4
+ - Jean-Baptiste/wikiner_fr
5
+ widget:
6
+ - text: "Je m'appelle jean-baptiste et je vis à montréal"
7
+ - text: "george washington est allé à washington"
8
+ license: mit
9
+ ---
10
+
11
+ # camembert-ner: model fine-tuned from camemBERT for NER task.
12
+
13
+ ## Introduction
14
+
15
+ [camembert-ner] is a NER model that was fine-tuned from camemBERT on the wikiner-fr dataset.
16
+ Model was trained on wikiner-fr dataset (~170 634 sentences).
17
+ The model was validated on email/chat data and outperformed other models on this type of data specifically.
18
+ In particular, the model seems to work better on entities that don't start with an uppercase letter.
19
+
20
+ ## Training data
21
+ Training data was classified as follows:
22
+
23
+ Abbreviation|Description
24
+ -|-
25
+ O |Outside of a named entity
26
+ MISC |Miscellaneous entity
27
+ PER |Person’s name
28
+ ORG |Organization
29
+ LOC |Location
30
+
31
+
32
+ ## How to use camembert-ner with HuggingFace
33
+
34
+ ##### Load camembert-ner and its sub-word tokenizer :
35
+
36
+ ```python
37
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
38
+
39
+ tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
40
+ model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
41
+
42
+
43
+ ##### Process text sample (from wikipedia)
44
+
45
+ from transformers import pipeline
46
+
47
+ nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
48
+ nlp("Apple est créée le 1er avril 1976 dans le garage de la maison d'enfance de Steve Jobs à Los Altos en Californie par Steve Jobs, Steve Wozniak et Ronald Wayne14, puis constituée sous forme de société le 3 janvier 1977 à l'origine sous le nom d'Apple Computer, mais pour ses 30 ans et pour refléter la diversification de ses produits, le mot « computer » est retiré le 9 janvier 2015.")
49
+
50
+
51
+ [{'entity_group': 'ORG',
52
+ 'score': 0.9472818374633789,
53
+ 'word': 'Apple',
54
+ 'start': 0,
55
+ 'end': 5},
56
+ {'entity_group': 'PER',
57
+ 'score': 0.9838564991950989,
58
+ 'word': 'Steve Jobs',
59
+ 'start': 74,
60
+ 'end': 85},
61
+ {'entity_group': 'LOC',
62
+ 'score': 0.9831605950991312,
63
+ 'word': 'Los Altos',
64
+ 'start': 87,
65
+ 'end': 97},
66
+ {'entity_group': 'LOC',
67
+ 'score': 0.9834540486335754,
68
+ 'word': 'Californie',
69
+ 'start': 100,
70
+ 'end': 111},
71
+ {'entity_group': 'PER',
72
+ 'score': 0.9841555754343668,
73
+ 'word': 'Steve Jobs',
74
+ 'start': 115,
75
+ 'end': 126},
76
+ {'entity_group': 'PER',
77
+ 'score': 0.9843501806259155,
78
+ 'word': 'Steve Wozniak',
79
+ 'start': 127,
80
+ 'end': 141},
81
+ {'entity_group': 'PER',
82
+ 'score': 0.9841533899307251,
83
+ 'word': 'Ronald Wayne',
84
+ 'start': 144,
85
+ 'end': 157},
86
+ {'entity_group': 'ORG',
87
+ 'score': 0.9468960364659628,
88
+ 'word': 'Apple Computer',
89
+ 'start': 243,
90
+ 'end': 257}]
91
+
92
+ ```
93
+
94
+
95
+ ## Model performances (metric: seqeval)
96
+
97
+ Overall
98
+
99
+ precision|recall|f1
100
+ -|-|-
101
+ 0.8859|0.8971|0.8914
102
+
103
+ By entity
104
+
105
+ entity|precision|recall|f1
106
+ -|-|-|-
107
+ PER|0.9372|0.9598|0.9483
108
+ ORG|0.8099|0.8265|0.8181
109
+ LOC|0.8905|0.9005|0.8955
110
+ MISC|0.8175|0.8117|0.8146
111
+
112
+
113
+
114
+
115
+ For those who may be interested, here is a short article on how I used the results of this model to train an LSTM model for signature detection in emails:
116
+ https://medium.com/@jean-baptiste.polle/lstm-model-for-email-signature-detection-8e990384fefa
audio/ner/camember-ner/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "camembert-base",
3
+ "architectures": [
4
+ "CamembertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 5,
8
+ "eos_token_id": 6,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "O",
15
+ "1": "I-LOC",
16
+ "2": "I-PER",
17
+ "3": "I-MISC",
18
+ "4": "I-ORG"
19
+ },
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 3072,
22
+ "label2id": {
23
+ "I-LOC": 1,
24
+ "I-MISC": 3,
25
+ "O": 0,
26
+ "I-ORG": 4,
27
+ "I-PER": 2
28
+ },
29
+ "layer_norm_eps": 1e-05,
30
+ "max_position_embeddings": 514,
31
+ "model_type": "camembert",
32
+ "num_attention_heads": 12,
33
+ "num_hidden_layers": 12,
34
+ "output_past": true,
35
+ "pad_token_id": 1,
36
+ "position_embedding_type": "absolute",
37
+ "transformers_version": "4.3.2",
38
+ "type_vocab_size": 1,
39
+ "use_cache": true,
40
+ "vocab_size": 32005
41
+ }
audio/ner/camember-ner/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9dfa2ec9e0bfad82d606782ba3e0ab94c001c46481514e0658fc20a624845c5
3
+ size 440422178
audio/ner/camember-ner/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:decc811bc764b0e94f1adc16b37a127300201eb8b5e4b733a89185d68b9d81c9
3
+ size 440168896
audio/ner/camember-ner/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9f586c5bc5943992fa49fe0c0c390dace2a48288d1cec0680cd96fcd17ed037
3
+ size 440227047
audio/ner/camember-ner/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:988bc5a00281c6d210a5d34bd143d0363741a432fefe741bf71e61b1869d4314
3
+ size 810912
audio/ner/camember-ner/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>", "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED"]}
audio/ner/camember-ner/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>", "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED"], "model_max_length": 512, "name_or_path": "camembert-base"}
audio/ner/camember-ner/vocab.txt ADDED
The diff for this file is too large to render. See raw diff