# MLADI / app.py: compute the evaluation metrics.
# TODO: add a requirements.txt
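# Inferred from the imports below, the requirements would presumably include:
# numpy, pandas, streamlit, torch, datasets, tqdm, transformers, scikit-learn.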
import os
import numpy as np
import pandas as pd
import streamlit as st
import torch
import datasets
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
model_name = st.text_input("Enter a model's name on HF")
# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
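# Added guard: wait until a model name is provided; an empty string would make the
# from_pretrained calls below fail.
if not model_name:
    st.stop()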
DIALECTS = [
"Algeria",
"Bahrain",
"Egypt",
"Iraq",
"Jordan",
"Kuwait",
"Lebanon",
"Libya",
"Morocco",
"Oman",
"Palestine",
"Qatar",
"Saudi_Arabia",
"Sudan",
"Syria",
"Tunisia",
"UAE",
"Yemen",
]
assert len(DIALECTS) == 18
DIALECTS_WITH_LABELS = [
"Algeria",
"Egypt",
"Iraq",
"Jordan",
"Morocco",
"Palestine",
"Saudi_Arabia",
"Sudan",
"Syria",
"Tunisia",
"Yemen",
]
assert len(DIALECTS_WITH_LABELS) == 11
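# Added sanity check: every labeled dialect must also be one of the model's classes,
# since predictions are filtered against DIALECTS by name in predict_top_p below.
assert all(dialect in DIALECTS for dialect in DIALECTS_WITH_LABELS)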
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
def predict_top_p(text, P=0.9):
    """Predict the smallest set of dialects whose cumulative probability is at least P."""
    assert 0 <= P <= 1

    with torch.no_grad():
        logits = model(**tokenizer(text, return_tensors="pt")).logits
    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
    topk_predictions = torch.topk(logits, len(DIALECTS)).indices.flatten().tolist()

    # Greedily mark the most probable dialects until their cumulative probability
    # reaches P.
    predictions = [0 for _ in range(len(DIALECTS))]
    total_prob = 0
    for i in range(len(DIALECTS)):
        total_prob += probabilities[topk_predictions[i]]
        predictions[topk_predictions[i]] = 1
        if total_prob >= P:
            break

    # Keep only the dialects that have gold labels in the evaluation dataset.
    return [
        predictions[i]
        for i, dialect in enumerate(DIALECTS)
        if dialect in DIALECTS_WITH_LABELS
    ]
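# Note (added): the returned 0/1 vector has length len(DIALECTS_WITH_LABELS) and
# follows the order of DIALECTS_WITH_LABELS (its first entry corresponds to "Algeria").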
# Load the dataset
dataset_name = "AMR-KELEG/test-dataset"
dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
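# The test split is assumed to have a "sentence" column plus one binary column per
# labeled dialect; the loop below reads the gold labels from those columns.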
sentences_labels, sentences_predictions = [], []
for sample in tqdm(dataset):
    text = sample["sentence"]
    # Build the gold-label vector over the labeled dialects only, so that it is
    # aligned with the vector returned by predict_top_p.
    labels = [
        1 if dialect in sample.keys() and int(sample[dialect]) == 1 else 0
        for dialect in DIALECTS_WITH_LABELS
    ]
    pred = predict_top_p(text)
    sentences_labels.append(labels)
    sentences_predictions.append(pred)
st.table(
data=pd.DataFrame(
{
"text": dataset["sentence"],
"labels": sentences_labels,
"predictions": sentences_predictions,
}
)
)
gold_matrix = np.array(sentences_labels)
prediction_matrix = np.array(sentences_predictions)
# Compute the scores for each label (country) on its own
accuracy_scores = [
accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
for i in range(gold_matrix.shape[1])
]
precision_scores = [
    precision_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
recall_scores = [
    recall_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
f1_scores = [
    f1_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
# Compute the averaged scores
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)
st.write(f"Average Accuracy: {average_accuracy:.2f}%")
st.write(f"Average Precision: {average_precision:.2f}%")
st.write(f"Average Recall: {average_recall:.2f}%")
st.write(f"Average F1: {average_f1:.2f}%")