# MLADI / app.py: compute the evaluation metrics.
# TODO: add a requirements.txt
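# Inferred from the imports below, the requirements would presumably include:
# numpy, pandas, streamlit, torch, datasets, tqdm, transformers, scikit-learn.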
import os
import numpy as np
import pandas as pd
import streamlit as st
import torch
import datasets
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
model_name = st.text_input("Enter a model's name on HF")
# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
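# Added guard: wait until a model name is provided; an empty string would make the
# from_pretrained calls below fail.
if not model_name:
    st.stop()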
DIALECTS = [
"Algeria",
"Bahrain",
"Egypt",
"Iraq",
"Jordan",
"Kuwait",
"Lebanon",
"Libya",
"Morocco",
"Oman",
"Palestine",
"Qatar",
"Saudi_Arabia",
"Sudan",
"Syria",
"Tunisia",
"UAE",
"Yemen",
]
assert len(DIALECTS) == 18
DIALECTS_WITH_LABELS = [
"Algeria",
"Egypt",
"Iraq",
"Jordan",
"Morocco",
"Palestine",
"Saudi_Arabia",
"Sudan",
"Syria",
"Tunisia",
"Yemen",
]
assert len(DIALECTS_WITH_LABELS) == 11
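# Added sanity check: every labeled dialect must also be one of the model's classes,
# since predictions are filtered against DIALECTS by name in predict_top_p below.
assert all(dialect in DIALECTS for dialect in DIALECTS_WITH_LABELS)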
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
def predict_top_p(text, P=0.9):
    """Predict the smallest set of dialects whose cumulative probability is at least P."""
    assert 0 <= P <= 1

    with torch.no_grad():
        logits = model(**tokenizer(text, return_tensors="pt")).logits
    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
    topk_predictions = torch.topk(logits, len(DIALECTS)).indices.flatten().tolist()

    # Greedily mark the most probable dialects until their cumulative probability
    # reaches P.
    predictions = [0 for _ in range(len(DIALECTS))]
    total_prob = 0
    for i in range(len(DIALECTS)):
        total_prob += probabilities[topk_predictions[i]]
        predictions[topk_predictions[i]] = 1
        if total_prob >= P:
            break

    # Keep only the dialects that have gold labels in the evaluation dataset.
    return [
        predictions[i]
        for i, dialect in enumerate(DIALECTS)
        if dialect in DIALECTS_WITH_LABELS
    ]
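# Note (added): the returned 0/1 vector has length len(DIALECTS_WITH_LABELS) and
# follows the order of DIALECTS_WITH_LABELS (its first entry corresponds to "Algeria").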
# Load the dataset
dataset_name = "AMR-KELEG/test-dataset"
dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
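# The test split is assumed to have a "sentence" column plus one binary column per
# labeled dialect; the loop below reads the gold labels from those columns.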
sentences_labels, sentences_predictions = [], []
for sample in tqdm(dataset):
    text = sample["sentence"]
    # Build the gold-label vector over the labeled dialects only, so that it is
    # aligned with the vector returned by predict_top_p.
    labels = [
        1 if dialect in sample.keys() and int(sample[dialect]) == 1 else 0
        for dialect in DIALECTS_WITH_LABELS
    ]
    pred = predict_top_p(text)
    sentences_labels.append(labels)
    sentences_predictions.append(pred)
st.table(
data=pd.DataFrame(
{
"text": dataset["sentence"],
"labels": sentences_labels,
"predictions": sentences_predictions,
}
)
)
gold_matrix = np.array(sentences_labels)
prediction_matrix = np.array(sentences_predictions)
# Compute the scores for each label (country) on its own
accuracy_scores = [
accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
for i in range(gold_matrix.shape[1])
]
precision_scores = [
    precision_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
recall_scores = [
    recall_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
f1_scores = [
    f1_score(
        y_true=gold_matrix[:, i],
        y_pred=prediction_matrix[:, i],
        average="binary",
        pos_label=1,
    )
    * 100
    for i in range(gold_matrix.shape[1])
]
# Compute the averaged scores
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)
st.write(f"Average Accuracy: {average_accuracy:.2f}%")
st.write(f"Average Precision: {average_precision:.2f}%")
st.write(f"Average Recall: {average_recall:.2f}%")
st.write(f"Average F1: {average_f1:.2f}%")