Run the evaluation in background

- app.py +6 -38
- background_inference.py +37 -0
- leaderboard_info.md +16 -0
- script.py +0 -5
app.py
CHANGED
@@ -16,7 +16,9 @@ from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
 
 
 st.title("NADI 2024 Leaderboard")
 
+with open("leaderboard.md", "r") as f:
+    MARKDOWN_TEXT = f.read()
 st.markdown(MARKDOWN_TEXT)
 
 tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
@@ -172,41 +174,7 @@ with tab2:
     )
 
     if model_name:
-
-
-
-        # Load the dataset
-        dataset_name = os.environ["DATASET_NAME"]
-        dataset = datasets.load_dataset(dataset_name)["test"]
-
-        sentences = dataset["sentence"]
-        labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
-
-        # TODO: Perform the inference in batches?
-        progress_text = f"Performing inference on {len(sentences)} sentences..."
-        progress_bar = st.progress(0, text=progress_text)
-
-        subprocess.Popen(["python", "script.py"])
-
-        # TODO: Switch to stqdm
-        predictions = []
-        for i, sentence in enumerate(sentences):
-            predictions.append(
-                getattr(eval_utils, inference_function)(model, tokenizer, sentence)
-            )
-            progress_bar.progress(
-                min(i / len(sentences), 1),
-                text=progress_text,
-            )
-            print(f"{model_name} - Progress: {i/len(sentences)}")
-        progress_bar.empty()
-
-        # Store the predictions in a private dataset
-        utils.upload_predictions(
-            os.environ["PREDICTIONS_DATASET_NAME"],
-            predictions,
-            model_name,
-            inference_function,
-        )
-
-        st.toast(f"Inference completed!")
+        subprocess.Popen(
+            ["python", "background_inference.py", model_name, inference_function]
+        )
+        st.info(f"Your evaluation request is being processed.")
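Note on the hand-off: subprocess.Popen starts background_inference.py and returns immediately, so the Streamlit request finishes as soon as the process is spawned; the old in-request progress bar and st.toast are replaced by the st.info message. A minimal sketch of an optional refinement (not part of this change; the helper name and log path are illustrative) that keeps the background process's progress prints in a per-model log file rather than interleaved in the app's own output:

import os
import subprocess

def launch_background_inference(model_name, inference_function):
    # Hypothetical helper: same command as in app.py, but with stdout/stderr
    # redirected to a per-model log file.
    os.makedirs("logs", exist_ok=True)
    log_file = open(f"logs/{model_name.replace('/', '_')}.log", "w")
    return subprocess.Popen(
        ["python", "background_inference.py", model_name, inference_function],
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )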
background_inference.py
ADDED
@@ -0,0 +1,37 @@
+import os
+import sys
+import utils
+import datasets
+import eval_utils
+from constants import DIALECTS_WITH_LABELS
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+model_name = sys.argv[1]
+inference_function = sys.argv[2]
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Load the dataset
+dataset_name = os.environ["DATASET_NAME"]
+dataset = datasets.load_dataset(dataset_name)["test"]
+
+sentences = dataset["sentence"]
+labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
+
+predictions = []
+for i, sentence in enumerate(sentences):
+    predictions.append(
+        getattr(eval_utils, inference_function)(model, tokenizer, sentence)
+    )
+    print("Inference progress: ", round((i + 1) / len(sentences), 2))
+
+# Store the predictions in a private dataset
+utils.upload_predictions(
+    os.environ["PREDICTIONS_DATASET_NAME"],
+    predictions,
+    model_name,
+    inference_function,
+)
+
+print(f"Inference completed!")
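The script is invoked as python background_inference.py <model_name> <inference_function>, where <inference_function> names a callable in eval_utils taking (model, tokenizer, sentence). eval_utils is not part of this diff; a hypothetical function with the expected signature, assuming a multi-label sigmoid head, might look like:

import torch

def predict_labels(model, tokenizer, sentence, threshold=0.5):
    # Illustrative only: tokenize one sentence, score it, and return the
    # indices of dialect labels whose sigmoid probability exceeds the threshold.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.sigmoid(logits).squeeze(0)
    return [i for i, p in enumerate(probabilities.tolist()) if p > threshold]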
leaderboard_info.md
ADDED
@@ -0,0 +1,16 @@
+# NADI 2024 Leaderboard
+This leaderboard serves as a public interface for benchmarking Arabic Dialect Identification (ADI) models using the NADI 2024 dataset, the first multi-label country-level ADI dataset.
+
+## Test Set Details
+The test set used for evaluation is composed of 1000 sentences geolocated to the 14 most-populated Arab countries (excluding Somalia, for which data was scarce). Each sample is annotated by native speakers recruited from 9 different Arab countries, namely: Algeria, Egypt, Iraq, Morocco, Palestine, Sudan, Syria, Tunisia, and Yemen.
+
+## Evaluation Metrics
+We compute the precision, recall, and F1 scores for each of the 9 countries (treating each label as a binary classification problem). Afterward, the per-country scores are macro-averaged to produce the overall leaderboard scores.
+
+## Data Access
+If you need to access the single-label training sets and the multi-label development set, please fill in the following form: https://forms.gle/t3QTC6ZqyDJBzAau8
+
+#### Further Notes
+* The beta version of the leaderboard is running on limited resources and is not able to evaluate models with a relatively large number of parameters.
+* Please refer to the [paper](https://aclanthology.org/2024.arabicnlp-1.79/) for more information about how the data was curated and annotated.
+* We are planning to extend the annotations to include more country-level dialects. If you are interested in helping, please ping us, and we will be happy to discuss it further.
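A minimal sketch of the per-country scoring described under Evaluation Metrics, using the same sklearn functions that app.py imports; the toy data and the final macro-average step are illustrative assumptions:

from sklearn.metrics import f1_score, precision_score, recall_score

# Toy 0/1 columns for two countries; the real columns come from the test set
# labels and from the submitted model's predictions.
labels = {"Egypt": [1, 0, 1, 1], "Morocco": [0, 0, 1, 0]}
predictions = {"Egypt": [1, 0, 0, 1], "Morocco": [0, 1, 1, 0]}

scores = {
    country: {
        "precision": precision_score(labels[country], predictions[country], zero_division=0),
        "recall": recall_score(labels[country], predictions[country], zero_division=0),
        "f1": f1_score(labels[country], predictions[country], zero_division=0),
    }
    for country in labels
}
macro_f1 = sum(s["f1"] for s in scores.values()) / len(scores)
print(scores, macro_f1)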
script.py
DELETED
@@ -1,5 +0,0 @@
-import time
-
-for i in range(1000):
-    time.sleep(1)
-    print(i)