|
|
|
import os |
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
|
|
import torch |
|
import datasets |
|
from tqdm import tqdm |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score |
|
|
|
# Hugging Face Hub identifier of the classifier to evaluate, typed by the user.
# Surrounding whitespace is dropped so a stray space does not break the lookup.
model_name = st.text_input("Enter a model's name on HF").strip()
|
|
|
# Country-level dialect labels. The ORDER matters: predict_top_p uses a
# dialect's position in this list as the classifier's class index.
DIALECTS = [
    "Algeria",
    "Bahrain",
    "Egypt",
    "Iraq",
    "Jordan",
    "Kuwait",
    "Lebanon",
    "Libya",
    "Morocco",
    "Oman",
    "Palestine",
    "Qatar",
    "Saudi_Arabia",
    "Sudan",
    "Syria",
    "Tunisia",
    "UAE",
    "Yemen",
]
# Sanity check guarding against accidental edits of the list above.
assert len(DIALECTS) == 18, "expected exactly 18 dialect labels"
|
|
|
# Dialects that are kept in the output of predict_top_p (one 0/1 flag per
# entry). Listed in the same relative order as in DIALECTS.
DIALECTS_WITH_LABELS = [
    "Algeria",
    "Egypt",
    "Iraq",
    "Jordan",
    "Morocco",
    "Palestine",
    "Saudi_Arabia",
    "Sudan",
    "Syria",
    "Tunisia",
    "Yemen",
]
# Sanity check guarding against accidental edits of the list above.
assert len(DIALECTS_WITH_LABELS) == 11, "expected exactly 11 labelled dialects"
|
|
|
# On the first render the text box is empty; loading "" from the Hub would
# raise, so halt this script run until the user has typed a model name.
if not model_name or not model_name.strip():
    st.info("Enter a model name to start the evaluation.")
    st.stop()

# Tokenizer and sequence-classification model fetched by name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
|
|
|
|
def predict_top_p(text, P=0.9):
    """Predict the top dialects with an accumulative confidence of at least P.

    Dialects are taken in decreasing order of model confidence until their
    summed probability reaches ``P``; each selected dialect is flagged with 1.

    Args:
        text: A single sentence to classify.
        P: Cumulative probability threshold, within [0, 1].

    Returns:
        A list of 0/1 flags with one entry per dialect of
        ``DIALECTS_WITH_LABELS`` (in ``DIALECTS`` order); dialects that are
        not in ``DIALECTS_WITH_LABELS`` are dropped from the output.

    Raises:
        ValueError: If ``P`` is outside [0, 1], or if the model does not emit
            exactly one logit per entry of ``DIALECTS``.
    """
    if not 0 <= P <= 1:
        raise ValueError(f"P must be within [0, 1], got {P!r}")

    n_dialects = len(DIALECTS)

    # Pure inference: no need to track gradients.
    with torch.no_grad():
        logits = model(**tokenizer(text, return_tensors="pt")).logits

    # Class index i is interpreted as DIALECTS[i], so the sizes must agree.
    if logits.shape[-1] != n_dialects:
        raise ValueError(
            f"model outputs {logits.shape[-1]} classes, expected {n_dialects}"
        )

    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
    # Class indices ordered from most to least likely.
    ranked_indices = torch.topk(logits, n_dialects).indices.flatten().tolist()

    predictions = [0] * n_dialects
    total_prob = 0.0
    for class_index in ranked_indices:
        total_prob += probabilities[class_index]
        predictions[class_index] = 1
        if total_prob >= P:
            break

    # Keep only the columns of the dialects listed in DIALECTS_WITH_LABELS.
    labelled = set(DIALECTS_WITH_LABELS)
    return [
        predictions[i]
        for i, dialect in enumerate(DIALECTS)
        if dialect in labelled
    ]
|
|
|
|
|
|
|
# Test split of the evaluation dataset; access requires the HF_TOKEN
# environment variable.
dataset_name = "AMR-KELEG/test-dataset"
dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]

sentences_labels, sentences_predictions = [], []

for sample in tqdm(dataset):
    text = sample["sentence"]
    # Gold flags are built over DIALECTS_WITH_LABELS so that their columns
    # line up one-to-one with the vector returned by predict_top_p.
    # A dialect column missing from the sample counts as "not valid" (0).
    labels = [
        1 if dialect in sample and int(sample[dialect]) == 1 else 0
        for dialect in DIALECTS_WITH_LABELS
    ]
    pred = predict_top_p(text)
    sentences_labels.append(labels)
    sentences_predictions.append(pred)

# Show every sentence next to its gold and predicted label vectors.
st.table(
    data=pd.DataFrame(
        {
            "text": dataset["sentence"],
            "labels": sentences_labels,
            "predictions": sentences_predictions,
        }
    )
)
|
|
|
# Rows are sentences, columns are per-dialect 0/1 flags.
gold_matrix = np.array(sentences_labels)
prediction_matrix = np.array(sentences_predictions)

# Column i of both matrices must describe the same dialect; fail loudly
# instead of scoring misaligned (or missing) columns.
if gold_matrix.ndim != 2 or gold_matrix.shape != prediction_matrix.shape:
    raise ValueError(
        "gold and prediction matrices must be 2-D with identical shapes, "
        f"got {gold_matrix.shape} and {prediction_matrix.shape}"
    )


def _per_label_percentages(metric, **metric_kwargs):
    """Apply `metric` to each label column and return the scores in percent."""
    return [
        metric(
            y_true=gold_matrix[:, i],
            y_pred=prediction_matrix[:, i],
            **metric_kwargs,
        )
        * 100
        for i in range(gold_matrix.shape[1])
    ]


# The matrices hold integer flags, so the positive class is the int 1
# (the string "1" is not a label present in the data).
_BINARY_KWARGS = {"average": "binary", "pos_label": 1}

accuracy_scores = _per_label_percentages(accuracy_score)
precision_scores = _per_label_percentages(precision_score, **_BINARY_KWARGS)
recall_scores = _per_label_percentages(recall_score, **_BINARY_KWARGS)
f1_scores = _per_label_percentages(f1_score, **_BINARY_KWARGS)

# Macro averages: unweighted mean of the per-dialect scores.
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)

st.write(f"Average Accuracy: {average_accuracy:.2f}%")
st.write(f"Average Precision: {average_precision:.2f}%")
st.write(f"Average Recall: {average_recall:.2f}%")
st.write(f"Average F1: {average_f1:.2f}%")
|
|