# Page residue from the web file viewer, kept as comments so the script parses:
# Montazerh82's picture
# add evaluate file
# a029051
import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
# Tokenizer belonging to the NER checkpoint that is evaluated below.
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2-ner")

# Test split of the evaluation corpus.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script run on this machine -- keep it only for sources you trust.
dataset = load_dataset(
    path='HaniehPoostchi/persian_ner',
    split='test',
    trust_remote_code=True,
)
# tag_to_num = {'O':0, 'I-EVE':1, 'I-FAC':2, 'I-LOC':3, 'I-ORG':4, 'I-PER':5, 'I-PRO':6, 'B-EVE':7, 'B-FAC':8, 'B-LOC':9, 'B-ORG':10, 'B-PER':11, 'B-PRO':12}
# Integer tag id -> IOB label string.  Used further down to decode the
# dataset's `ner_tags` ids into the string labels that are scored.
# NOTE(review): the id order is assumed to follow the dataset's own label
# order -- confirm against the dataset features.
num_to_tags = dict(enumerate((
    'O',
    'I-EVE', 'I-FAC', 'I-LOC', 'I-ORG', 'I-PER', 'I-PRO',
    'B-EVE', 'B-FAC', 'B-LOC', 'B-ORG', 'B-PER', 'B-PRO',
)))
def add_text(examples):
    """Build a ``text`` column for a batch by space-joining each token list.

    Args:
        examples: Batch mapping whose ``'tokens'`` entry holds one sequence
            of token strings per example.

    Returns:
        A dict with the single key ``'text'`` containing one space-joined
        string per example, in input order.
    """
    sentences = list(map(' '.join, examples['tokens']))
    return {'text': sentences}
# Evaluate on a fixed random subsample of at most 100 examples.
# The subsample is drawn *before* the `text` column is built so that only the
# kept rows are processed; the shuffle depends only on the row count and the
# seed, so the selected rows are the same as when mapping first.
# The size is clamped so a split with fewer than 100 rows does not make
# `select` fail on out-of-range indices.
sample_size = min(100, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(sample_size))
dataset = dataset.map(add_text, batched=True)
# Token-classification pipeline for the NER checkpoint.
# The module-level tokenizer is passed in explicitly so the pipeline and any
# code that interprets the pipeline's token indices use one and the same
# tokenizer instance, and the tokenizer is not loaded a second time.
pipe = pipeline("token-classification",
                model="HooshvareLab/albert-fa-zwnj-base-v2-ner",
                tokenizer=tokenizer)
def predict(example):
    """Tag one example with the NER pipeline, one label per dataset token.

    The pipeline labels sub-word pieces of ``example['text']``.  Each piece
    is assigned to the dataset token whose character span contains the
    piece's ``start`` offset, so the result always has
    ``len(example['tokens'])`` labels and lines up position-by-position with
    the example's gold tags.  Aligning through the tokenizer's own word
    segmentation instead can yield a different number of "words" than the
    dataset has tokens (e.g. if the tokenizer splits inside a token), which
    shifts labels or produces sequences of unequal length.

    A token receives the label of its first sub-word piece that the pipeline
    reports; tokens with no reported piece stay ``'O'``.

    Args:
        example: Mapping with ``'tokens'`` (list of str) and ``'text'``.
            ``'text'`` is assumed to be ``' '.join(example['tokens'])``, as
            produced by ``add_text``.

    Returns:
        ``{'predictions': labels}`` where ``labels`` is a list of label
        strings with one entry per dataset token.

    Raises:
        ValueError: If a pipeline result carries no ``start`` character
            offset, in which case alignment is impossible.
    """
    tokens = example['tokens']

    # Character position in ' '.join(tokens) -> index of the owning token.
    # The separator after token i is mapped to token i + 1, because some
    # tokenizers report a piece's offset as starting on the preceding space.
    char_to_word = []
    for token_idx, token in enumerate(tokens):
        char_to_word.extend([token_idx] * len(token))
        char_to_word.append(token_idx + 1)

    predictions = ['O'] * len(tokens)
    for entity in pipe(example['text']):
        start = entity.get('start')
        if start is None:
            raise ValueError(
                "pipeline result has no 'start' offset; character-based "
                "alignment needs a tokenizer that returns offsets")
        if start >= len(char_to_word):
            # Offset beyond the joined tokens: nothing to attach it to.
            continue
        word_id = char_to_word[start]
        # word_id == len(tokens) only for the trailing separator slot.
        if word_id < len(tokens) and predictions[word_id] == 'O':
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}
# Run the model on every sampled example; adds a `predictions` column.
dataset = dataset.map(predict)

# Gold labels: decode integer tag ids into label strings.
true_labels = [
    list(map(num_to_tags.__getitem__, tag_ids))
    for tag_ids in dataset['ner_tags']
]
predicted_labels = dataset['predictions']

# Score the predicted label sequences against the gold ones with seqeval.
recall = recall_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# One-row report; the key order below is the CSV column order.
result = pd.DataFrame([{
    'model': "HooshvareLab/albert-fa-zwnj-base-v2-ner",
    'evaluation_dataset': 'HaniehPoostchi/persian_ner',
    'Recall': recall,
    'Precision': precision,
    'F1': f1,
}])
result.to_csv('result.csv', index=False)