Montazerh82
committed on
Commit
·
a029051
1
Parent(s):
266ec2a
add evaluate file
Browse files
- app.py +1 -1
- evaluate.py +73 -0
- result.csv +2 -0
app.py
CHANGED
@@ -21,7 +21,7 @@ entity_mapping = {'DAT': 'تاریخ',
|
|
21 |
'LOC': 'مکان',
|
22 |
'MON': 'پول',
|
23 |
'ORG': 'سازمان',
|
24 |
-
'PCT': '
|
25 |
'PER': 'شخص',
|
26 |
'PRO': 'محصول',
|
27 |
'TIM': 'زمان'}
|
|
|
21 |
'LOC': 'مکان',
|
22 |
'MON': 'پول',
|
23 |
'ORG': 'سازمان',
|
24 |
+
'PCT': 'درصد',
|
25 |
'PER': 'شخص',
|
26 |
'PRO': 'محصول',
|
27 |
'TIM': 'زمان'}
|
evaluate.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from seqeval.metrics import f1_score, precision_score, recall_score
|
3 |
+
from transformers import pipeline, AutoTokenizer
|
4 |
+
from datasets import load_dataset
|
5 |
+
|
6 |
+
# Tokenizer of the model under evaluation. predict() uses it to map the
# pipeline's token indices back to word positions.
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/albert-fa-zwnj-base-v2-ner")

# Test split of the evaluation dataset. trust_remote_code=True lets the
# dataset's own loading script run.
dataset = load_dataset('HaniehPoostchi/persian_ner', split='test', trust_remote_code=True)
|
10 |
+
|
11 |
+
# tag_to_num = {'O':0, 'I-EVE':1, 'I-FAC':2, 'I-LOC':3, 'I-ORG':4, 'I-PER':5, 'I-PRO':6, 'B-EVE':7, 'B-FAC':8, 'B-LOC':9, 'B-ORG':10, 'B-PER':11, 'B-PRO':12}
|
12 |
+
# Maps the dataset's integer `ner_tags` ids to tag strings, so the gold labels
# can be compared with the string labels stored in the 'predictions' column.
num_to_tags = {
    0: 'O',
    1: 'I-EVE',
    2: 'I-FAC',
    3: 'I-LOC',
    4: 'I-ORG',
    5: 'I-PER',
    6: 'I-PRO',
    7: 'B-EVE',
    8: 'B-FAC',
    9: 'B-LOC',
    10: 'B-ORG',
    11: 'B-PER',
    12: 'B-PRO',
}
|
25 |
+
|
26 |
+
def add_text(examples):
    """Build the raw-text column for a batch of examples.

    Args:
        examples: Batch mapping with a 'tokens' entry that holds one list of
            token strings per example.

    Returns:
        A dict with a single 'text' entry: each example's tokens joined by
        one space, in the same order as the input batch.
    """
    return {'text': [' '.join(tokens) for tokens in examples['tokens']]}
|
31 |
+
|
32 |
+
# Add the space-joined 'text' column, processing examples in batches.
dataset = dataset.map(add_text, batched=True)
# Evaluate on a fixed 100-example sample; the seed keeps the sample stable
# between runs.
dataset = dataset.shuffle(seed=42).select(range(100))

# Token-classification pipeline for the model under evaluation.
pipe = pipeline("token-classification",
                model="HooshvareLab/albert-fa-zwnj-base-v2-ner")
|
37 |
+
|
38 |
+
|
39 |
+
def predict(example):
    """Label every word of one example with the pipeline's predictions.

    The text is tokenized with the module-level `tokenizer` to learn how
    many words it contains and which word each token belongs to. It is then
    run through the module-level `pipe`. Each entity the pipeline reports is
    mapped from its token index to a word position.

    Args:
        example: Mapping with a 'text' entry (the sentence to tag).

    Returns:
        A dict with a single 'predictions' entry: a list with one label per
        word, 'O' where the pipeline reported nothing for that word.
    """
    tokenized = tokenizer(example['text'])

    # word_ids() holds None for tokens that belong to no word; drop it so
    # only real word positions are counted. discard() is safe when None is
    # absent, unlike remove().
    words = set(tokenized.word_ids())
    words.discard(None)
    words_num = len(words)

    result = pipe(example['text'])

    predictions = ['O'] * words_num

    for entity in result:
        word_id = tokenized.token_to_word(entity['index'])
        if word_id is None:
            # The token maps to no word, so there is no slot to label.
            continue
        # Only the first labelled token of a word decides its label; later
        # tokens of the same word are ignored.
        if predictions[word_id] == 'O':
            # NOTE(review): labels are stored exactly as the pipeline emits
            # them, including any that are not in the dataset's tag set.
            # Confirm this is the intended scoring behaviour.
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}
|
58 |
+
|
59 |
+
# Add the per-word 'predictions' column, one example at a time.
dataset = dataset.map(predict)

# Gold labels as tag strings, so they are comparable with the predictions.
true_labels = [[num_to_tags[tag] for tag in example] for example in dataset['ner_tags']]
predicted_labels = dataset['predictions']

# One row of scores, computed by seqeval. Key order fixes the CSV column order.
metrics = {'model': "HooshvareLab/albert-fa-zwnj-base-v2-ner",
           'evaluation_dataset': 'HaniehPoostchi/persian_ner',
           'Recall': recall_score(true_labels, predicted_labels),
           'Precision': precision_score(true_labels, predicted_labels),
           'F1': f1_score(true_labels, predicted_labels)}

result = pd.DataFrame([metrics])

# index=False keeps the pandas row index out of the CSV.
result.to_csv('result.csv', index=False)
|
result.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
model,evaluation_dataset,Recall,Precision,F1
|
2 |
+
HooshvareLab/albert-fa-zwnj-base-v2-ner,HaniehPoostchi/persian_ner,0.9745222929936306,0.9272727272727272,0.9503105590062112
|