Montazerh82 committed on
Commit
a029051
·
1 Parent(s): 266ec2a

add evaluate file

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. evaluate.py +73 -0
  3. result.csv +2 -0
app.py CHANGED
@@ -21,7 +21,7 @@ entity_mapping = {'DAT': 'تاریخ',
21
  'LOC': 'مکان',
22
  'MON': 'پول',
23
  'ORG': 'سازمان',
24
- 'PCT': 'شخصیت',
25
  'PER': 'شخص',
26
  'PRO': 'محصول',
27
  'TIM': 'زمان'}
 
21
  'LOC': 'مکان',
22
  'MON': 'پول',
23
  'ORG': 'سازمان',
24
+ 'PCT': 'درصد',
25
  'PER': 'شخص',
26
  'PRO': 'محصول',
27
  'TIM': 'زمان'}
evaluate.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from seqeval.metrics import f1_score, precision_score, recall_score
3
+ from transformers import pipeline, AutoTokenizer
4
+ from datasets import load_dataset
5
+
6
# Tokenizer of the model under evaluation; predict() uses it to map the
# pipeline's token indices back to word positions.
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/albert-fa-zwnj-base-v2-ner")

# NOTE(review): trust_remote_code=True lets the dataset repository run its
# own loading script on this machine. Keep it only while that repository is
# trusted; consider pinning a revision.
dataset = load_dataset('HaniehPoostchi/persian_ner', split='test',
                       trust_remote_code=True)
10
+
11
+ # tag_to_num = {'O':0, 'I-EVE':1, 'I-FAC':2, 'I-LOC':3, 'I-ORG':4, 'I-PER':5, 'I-PRO':6, 'B-EVE':7, 'B-FAC':8, 'B-LOC':9, 'B-ORG':10, 'B-PER':11, 'B-PRO':12}
12
# Maps the integer class ids found in the dataset's ``ner_tags`` column to
# tag strings, so the gold labels can be compared with the string tags the
# model predicts.
num_to_tags = {
    0: 'O',
    1: 'I-EVE',
    2: 'I-FAC',
    3: 'I-LOC',
    4: 'I-ORG',
    5: 'I-PER',
    6: 'I-PRO',
    7: 'B-EVE',
    8: 'B-FAC',
    9: 'B-LOC',
    10: 'B-ORG',
    11: 'B-PER',
    12: 'B-PRO',
}
25
+
26
def add_text(examples):
    """Build a ``text`` column by joining each example's tokens with spaces.

    Written for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict whose ``'tokens'`` entry holds one list of
            token strings per example.

    Returns:
        A dict with a single ``'text'`` key containing one space-joined
        string per example, in the same order as the input.
    """
    return {'text': [' '.join(tokens) for tokens in examples['tokens']]}
31
+
32
# Draw the fixed-seed evaluation subset first, then build the ``text`` column
# only for those rows; add_text works per example, so the resulting rows are
# the same as mapping the whole split and subsampling afterwards.
# min() keeps select() valid if the split has fewer than 100 examples.
dataset = dataset.shuffle(seed=42).select(range(min(100, len(dataset))))
dataset = dataset.map(add_text, batched=True)

# Token-classification pipeline for the model being evaluated.
pipe = pipeline("token-classification",
                model="HooshvareLab/albert-fa-zwnj-base-v2-ner")
37
+
38
+
39
def predict(example):
    """Predict one tag per word of ``example['text']``.

    The text is tokenized with the module-level ``tokenizer`` to learn how
    many words it contains and which word each token belongs to, then run
    through the module-level ``pipe``. Every word starts as ``'O'``; the
    first entity the pipeline reports for a word sets that word's tag and
    later entities for the same word are ignored.

    Args:
        example: Mapping with a ``'text'`` string.

    Returns:
        ``{'predictions': [...]}`` with one tag string per word.
    """
    tokenized = tokenizer(example['text'])

    # word_ids() reports None for tokens that belong to no word (special
    # tokens). discard() also copes with an encoding that contains none,
    # where remove() would raise KeyError.
    word_ids = set(tokenized.word_ids())
    word_ids.discard(None)
    predictions = ['O'] * len(word_ids)

    # NOTE(review): the word count comes from the tokenizer's own word
    # splitting; confirm it always equals the number of dataset tokens,
    # otherwise the predictions will not line up with ``ner_tags``.
    for entity in pipe(example['text']):
        word_id = tokenized.token_to_word(entity['index'])
        if word_id is None:
            # Token maps to no word; there is no slot to label.
            continue
        if predictions[word_id] == 'O':
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}
58
+
59
# Run the model on every selected example; adds a ``predictions`` column.
dataset = dataset.map(predict)

# Gold labels are stored as integer ids; convert them to tag strings so they
# are comparable with the predicted tag strings.
true_labels = [[num_to_tags[tag] for tag in example]
               for example in dataset['ner_tags']]
predicted_labels = dataset['predictions']

# NOTE(review): predicted tags are scored exactly as the model emits them;
# if the model produces entity types absent from num_to_tags, confirm that
# counting them against the gold labels is intended.
result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2-ner",
          'evaluation_dataset': 'HaniehPoostchi/persian_ner',
          'Recall': recall_score(true_labels, predicted_labels),
          'Precision': precision_score(true_labels, predicted_labels),
          'F1': f1_score(true_labels, predicted_labels)}

# Single-row table: one line per evaluated (model, dataset) pair.
result = pd.DataFrame([result])

result.to_csv('result.csv', index=False)
result.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ model,evaluation_dataset,Recall,Precision,F1
2
+ HooshvareLab/albert-fa-zwnj-base-v2-ner,HaniehPoostchi/persian_ner,0.9745222929936306,0.9272727272727272,0.9503105590062112