AMR-KELEG committed
Commit 48a308f · 1 Parent(s): 80852b8

Compute the Evaluation Metrics

Files changed (2)
  1. app.py +73 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
  # TODO: requirements.txt
  import os
+ import numpy as np
  import pandas as pd
  import streamlit as st
@@ -7,6 +8,7 @@ import torch
  import datasets
  from tqdm import tqdm
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

  model_name = st.text_input("Enter a model's name on HF")
  # MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
@@ -32,6 +34,21 @@ DIALECTS = [
  ]
  assert len(DIALECTS) == 18

+ DIALECTS_WITH_LABELS = [
+     "Algeria",
+     "Egypt",
+     "Iraq",
+     "Jordan",
+     "Morocco",
+     "Palestine",
+     "Saudi_Arabia",
+     "Sudan",
+     "Syria",
+     "Tunisia",
+     "Yemen",
+ ]
+ assert len(DIALECTS_WITH_LABELS) == 11
+
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
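Note on the two label sets: the model predicts over all 18 DIALECTS, while gold annotations exist only for the 11 entries of DIALECTS_WITH_LABELS. The commit's asserts check the list lengths but not that the smaller set is contained in the larger one; a one-line sanity check (ours, not in the commit) would be:

    # Not in the commit: every labelled dialect must also be one of the 18
    # predictable dialects, or the index filtering below would silently drop it.
    assert set(DIALECTS_WITH_LABELS) <= set(DIALECTS)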
@@ -53,6 +70,11 @@ def predict_top_p(text, P=0.9):
          if total_prob >= P:
              break

+     return [
+         predictions[i]
+         for i, dialect in enumerate(DIALECTS)
+         if dialect in DIALECTS_WITH_LABELS
+     ]
      return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
 
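This hunk shows only the tail of predict_top_p, and the new early return makes the old name-based return below it unreachable dead code, presumably left in place during the refactor. For readers of the commit in isolation, here is a hedged reconstruction of the whole function, assuming a standard top-p (nucleus) scheme over the softmax of the classifier's logits; everything above the lines shown in the hunk is our guess, not part of the commit:

    import torch  # already imported at the top of app.py

    def predict_top_p_sketch(text, P=0.9):
        # Assumed setup: tokenizer, model, DIALECTS, and DIALECTS_WITH_LABELS
        # are the module-level objects defined earlier in app.py.
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0)
        predictions = [0] * len(DIALECTS)
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        total_prob = 0.0
        # Greedily mark the most probable dialects until their cumulative
        # probability mass reaches P.
        for prob, index in zip(sorted_probs.tolist(), sorted_indices.tolist()):
            predictions[index] = 1
            total_prob += prob
            if total_prob >= P:
                break
        # As in the commit: keep only the dialects that have gold labels.
        return [
            predictions[i]
            for i, dialect in enumerate(DIALECTS)
            if dialect in DIALECTS_WITH_LABELS
        ]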
@@ -65,9 +87,8 @@ sentences_labels, sentences_predictions = [], []
  for sample in tqdm(dataset):
      text = sample["sentence"]
      labels = [
-         DIALECTS[i]
+         1 if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1 else 0
          for i in range(len(DIALECTS))
-         if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1
      ]
      pred = predict_top_p(text)
      sentences_labels.append(labels)
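The labels change is the crux of the commit: each sample now yields a fixed-length 0/1 vector aligned with DIALECTS instead of a variable-length list of dialect names, so the per-sample lists can be stacked into a matrix. Note, though, that these gold vectors span all 18 DIALECTS while the updated predict_top_p returns 11-wide vectors over DIALECTS_WITH_LABELS, so the gold columns appear to need the same filtering before the per-column scoring below. A minimal sketch of that alignment (the helper is ours, not from the commit):

    import numpy as np

    # Hypothetical helper, not in the commit: keep only the gold columns for
    # the labelled dialects, in the order predict_top_p emits them.
    LABELLED_COLUMNS = [i for i, d in enumerate(DIALECTS) if d in DIALECTS_WITH_LABELS]

    def align_gold(sentences_labels):
        # (num_samples, 18) -> (num_samples, 11)
        return np.array(sentences_labels)[:, LABELLED_COLUMNS]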
@@ -82,3 +103,53 @@ st.table(
      }
  )
)
+
+ gold_matrix = np.array(sentences_labels)
+ prediction_matrix = np.array(sentences_predictions)
+
+ # Compute the scores for each label (country) on its own
+ accuracy_scores = [
+     accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ precision_scores = [
+     precision_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ recall_scores = [
+     recall_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ f1_scores = [
+     f1_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+
+ # Compute the averaged scores
+ average_accuracy = np.mean(accuracy_scores)
+ average_precision = np.mean(precision_scores)
+ average_recall = np.mean(recall_scores)
+ average_f1 = np.mean(f1_scores)
+
+ st.write(f"Average Accuracy: {average_accuracy:.2f}%")
+ st.write(f"Average Precision: {average_precision:.2f}%")
+ st.write(f"Average Recall: {average_recall:.2f}%")
+ st.write(f"Average F1: {average_f1:.2f}%")
requirements.txt CHANGED
@@ -2,3 +2,4 @@ transformers
  torch
  datasets
  pandas
+ numpy