AMR-KELEG committed
Commit 0382281 · 1 Parent(s): 465af14

Populate the leaderboard

Files changed (2)
  1. app.py +54 -27
  2. utils.py +10 -3
app.py CHANGED
@@ -11,13 +11,63 @@ from inspect import getmembers, isfunction
 import eval_utils
 import utils
 import numpy as np
+import pandas as pd
 from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
 
 
 tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
 
 with tab1:
-    st.write("Leaderboard")
+    # Load the labels
+    dataset_name = os.environ["DATASET_NAME"]
+    dataset = datasets.load_dataset(dataset_name)["test"]
+    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
+
+    # Load the models' predictions
+    model_predictions_rows = datasets.load_dataset(
+        os.environ["PREDICTIONS_DATASET_NAME"]
+    )["train"]
+
+    evaluation_metrics = []
+    for row in model_predictions_rows:
+        # Evaluate the models
+        accuracy_scores = {}
+        f1_scores = {}
+        recall_scores = {}
+        precision_scores = {}
+        predictions = row["predictions"]
+
+        for dialect in DIALECTS_WITH_LABELS:
+            y_true = labels[dialect]
+            y_pred = [dialect in prediction for prediction in predictions]
+            accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            precision = precision_score(y_true, y_pred)
+
+            accuracy_scores[dialect] = accuracy
+            f1_scores[dialect] = f1
+            recall_scores[dialect] = recall
+            precision_scores[dialect] = precision
+
+        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
+        macro_avg_f1 = np.mean(list(f1_scores.values()))
+        macro_avg_recall = np.mean(list(recall_scores.values()))
+        macro_avg_precision = np.mean(list(precision_scores.values()))
+
+        evaluation_metrics.append(
+            {
+                "model_name": row["model_name"],
+                "macro_avg_accuracy": macro_avg_accuracy,
+                "macro_avg_f1": macro_avg_f1,
+                "macro_avg_recall": macro_avg_recall,
+                "macro_avg_precision": macro_avg_precision,
+            }
+        )
+    results_df = pd.DataFrame(evaluation_metrics).sort_values(
+        "macro_avg_f1", ascending=False
+    )
+    st.table(results_df)
 
 with tab2:
     model_name = st.text_input("Enter a model's name on HF")
@@ -25,6 +75,7 @@ with tab2:
         "Inference Method",
         [func_name for func_name, _ in getmembers(eval_utils, isfunction)],
     )
+
    if model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
@@ -45,31 +96,7 @@ with tab2:
 
         # Store the predictions in a private dataset
         utils.upload_predictions(
-            os.environ["PREDICTIONS_DATASET_NAME"], predictions, model_name
+            os.environ["PREDICTIONS_DATASET_NAME"], predictions, model_name, inference_function
         )
 
-        # Evaluate the model
-        accuracy_scores = {}
-        f1_scores = {}
-        recall_scores = {}
-        precision_scores = {}
-
-        for dialect in DIALECTS_WITH_LABELS:
-            y_true = labels[dialect]
-            y_pred = [dialect in prediction for prediction in predictions]
-            accuracy = accuracy_score(y_true, y_pred)
-            f1 = f1_score(y_true, y_pred)
-            recall = recall_score(y_true, y_pred)
-            precision = precision_score(y_true, y_pred)
-
-            accuracy_scores[dialect] = accuracy
-            f1_scores[dialect] = f1
-            recall_scores[dialect] = recall
-            precision_scores[dialect] = precision
-
-        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
-        macro_avg_f1 = np.mean(list(f1_scores.values()))
-        macro_avg_recall = np.mean(list(recall_scores.values()))
-        macro_avg_precision = np.mean(list(precision_scores.values()))
-
-        st.toast(f"Evaluation completed!")
+        st.toast(f"Inference completed!")
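
For reference, a minimal self-contained sketch of the per-dialect evaluation that the new tab1 code performs; the dialect names and the label/prediction shapes below are illustrative assumptions, not the real contents of DATASET_NAME or the predictions dataset:

import numpy as np
from sklearn.metrics import f1_score

# Illustrative stand-ins: in the app, labels come from the test split of
# os.environ["DATASET_NAME"] and predictions from the private predictions dataset.
DIALECTS_WITH_LABELS = ["Egypt", "Morocco"]
labels = {
    "Egypt": [True, False, True],
    "Morocco": [False, False, True],
}
predictions = [["Egypt"], ["Morocco"], ["Egypt", "Morocco"]]  # one dialect list per sample

f1_scores = {}
for dialect in DIALECTS_WITH_LABELS:
    y_true = labels[dialect]
    # A sample counts as positive for a dialect iff that dialect appears in its prediction list.
    y_pred = [dialect in prediction for prediction in predictions]
    f1_scores[dialect] = f1_score(y_true, y_pred)

# The leaderboard table is sorted by this macro-average across dialects.
macro_avg_f1 = np.mean(list(f1_scores.values()))
print(f1_scores, macro_avg_f1)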
utils.py CHANGED
@@ -8,13 +8,20 @@ def current_seconds_time():
     return round(time.time())
 
 
-def upload_predictions(repo_id, predictions, model_name):
+def upload_predictions(repo_id, predictions, model_name, inference_function):
     api = HfApi()
 
+    timestamp = current_seconds_time()
     predictions_filename = (
-        f"predictions_{current_seconds_time()}_{re.sub('/', '_', model_name)}.json"
+        f"predictions_{timestamp}_{re.sub('/', '_', model_name)}.json"
     )
-    predictions_object = {"model_name": model_name, "predictions": predictions}
+
+    predictions_object = {
+        "model_name": model_name,
+        "predictions": predictions,
+        "timestamp": timestamp,
+        "inference_function": inference_function,
+    }
 
     with open(predictions_filename, "w") as f:
         json.dump(predictions_object, f)
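
Note that the hunk ends at json.dump, so the actual push of the written JSON file to the predictions dataset happens outside the shown context. A hedged sketch of how such an upload could be done with huggingface_hub; the helper name and exact arguments are assumptions, not the code in utils.py:

from huggingface_hub import HfApi

def push_predictions_file(repo_id: str, predictions_filename: str) -> None:
    # Illustrative only: upload the locally written JSON into the private
    # predictions dataset repo (repo_id would be PREDICTIONS_DATASET_NAME).
    api = HfApi()
    api.upload_file(
        path_or_fileobj=predictions_filename,
        path_in_repo=predictions_filename,
        repo_id=repo_id,
        repo_type="dataset",
    )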