import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
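
# Tokenizer for the evaluated checkpoint; predict() uses it to map the
# pipeline's token indices back to whitespace-delimited word positions.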
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/albert-fa-zwnj-base-v2-ner")
dataset = load_dataset('HaniehPoostchi/persian_ner', split='test',
                       trust_remote_code=True)
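
# The dataset encodes ner_tags as integers; map them to IOB label strings
# (six entity types: event, facility, location, organization, person,
# product) so seqeval can score them.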
num_to_tags = {0: 'O',
               1: 'I-EVE',
               2: 'I-FAC',
               3: 'I-LOC',
               4: 'I-ORG',
               5: 'I-PER',
               6: 'I-PRO',
               7: 'B-EVE',
               8: 'B-FAC',
               9: 'B-LOC',
               10: 'B-ORG',
               11: 'B-PER',
               12: 'B-PRO'}
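
# Join each pre-tokenized sentence back into plain text for the pipeline.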
def add_text(examples):
    return {'text': [' '.join(tokens) for tokens in examples['tokens']]}
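
# Attach the text column, then evaluate on a fixed random sample of 100
# sentences (seeded, so the run is reproducible).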
dataset = dataset.map(add_text, batched=True)
dataset = dataset.shuffle(seed=42).select(range(100))
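
# Token-classification pipeline for the model under evaluation; passing
# the tokenizer loaded above keeps its token indices consistent with
# predict().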
pipe = pipeline("token-classification",
                model="HooshvareLab/albert-fa-zwnj-base-v2-ner",
                tokenizer=tokenizer)
def predict(example):
    tokenized = tokenizer(example['text'])

    # Count whitespace-delimited words; special tokens ([CLS], [SEP]) have
    # word_id None, so drop them.
    words = set(tokenized.word_ids())
    words.discard(None)
    words_num = len(words)

    result = pipe(example['text'])

    # Start with 'O' for every word, then fill in predicted entity labels.
    predictions = ['O'] * words_num

    for entity in result:
        # entity['index'] is the token's position in the model input, so
        # token_to_word() recovers the word it belongs to; only the first
        # sub-token's label is kept for each word.
        word_id = tokenized.token_to_word(entity['index'])
        if word_id is not None and predictions[word_id] == 'O':
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}
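
# Run inference over the sample; map() adds a 'predictions' column.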
dataset = dataset.map(predict)
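
# seqeval expects sequences of label strings, so convert the gold integer
# tags with num_to_tags.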
true_labels = [[num_to_tags[tag] for tag in example]
               for example in dataset['ner_tags']]
predicted_labels = dataset['predictions']
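
# seqeval computes entity-level (not token-level) precision, recall, and F1.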
result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2-ner",
          'evaluation_dataset': 'HaniehPoostchi/persian_ner',
          'Recall': recall_score(true_labels, predicted_labels),
          'Precision': precision_score(true_labels, predicted_labels),
          'F1': f1_score(true_labels, predicted_labels)}
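
# Save a one-row summary of the run.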
result = pd.DataFrame([result])
result.to_csv('result.csv', index=False)