AMR-KELEG committed on
Commit
5a3355b
·
1 Parent(s): 2ba85d4

Run the evaluation in background

Browse files
Files changed (4) hide show
  1. app.py +6 -38
  2. background_inference.py +37 -0
  3. leaderboard_info.md +16 -0
  4. script.py +0 -5
app.py CHANGED
@@ -16,7 +16,9 @@ from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_sc
16
 
17
 
18
  st.title("NADI 2024 Leaderboard")
19
- MARKDOWN_TEXT = """TODO"""
 
 
20
  st.markdown(MARKDOWN_TEXT)
21
 
22
  tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
@@ -172,41 +174,7 @@ with tab2:
172
  )
173
 
174
  if model_name:
175
- tokenizer = AutoTokenizer.from_pretrained(model_name)
176
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
177
-
178
- # Load the dataset
179
- dataset_name = os.environ["DATASET_NAME"]
180
- dataset = datasets.load_dataset(dataset_name)["test"]
181
-
182
- sentences = dataset["sentence"]
183
- labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
184
-
185
- # TODO: Perform the inference in batches?
186
- progress_text = f"Performing inference on {len(sentences)} sentences..."
187
- progress_bar = st.progress(0, text=progress_text)
188
-
189
- subprocess.Popen(["python", "script.py"])
190
-
191
- # TODO: Switch to stqdm
192
- predictions = []
193
- for i, sentence in enumerate(sentences):
194
- predictions.append(
195
- getattr(eval_utils, inference_function)(model, tokenizer, sentence)
196
- )
197
- progress_bar.progress(
198
- min(i / len(sentences), 1),
199
- text=progress_text,
200
- )
201
- print(f"{model_name} - Progress: {i/len(sentences)}")
202
- progress_bar.empty()
203
-
204
- # Store the predictions in a private dataset
205
- utils.upload_predictions(
206
- os.environ["PREDICTIONS_DATASET_NAME"],
207
- predictions,
208
- model_name,
209
- inference_function,
210
  )
211
-
212
- st.toast(f"Inference completed!")
 
16
 
17
 
18
  st.title("NADI 2024 Leaderboard")
19
+
20
+ with open("leaderboard.md", "r") as f:
21
+ MARKDOWN_TEXT = f.read()
22
  st.markdown(MARKDOWN_TEXT)
23
 
24
  tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
 
174
  )
175
 
176
  if model_name:
177
+ subprocess.Popen(
178
+ ["python", "background_inference.py", model_name, inference_function]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  )
180
+ st.info("Your evaluation request is being processed.")
 
background_inference.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import utils
4
+ import datasets
5
+ import eval_utils
6
+ from constants import DIALECTS_WITH_LABELS
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+
9
+ model_name = sys.argv[1]
10
+ inference_function = sys.argv[2]
11
+
12
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
13
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
14
+
15
+ # Load the dataset
16
+ dataset_name = os.environ["DATASET_NAME"]
17
+ dataset = datasets.load_dataset(dataset_name)["test"]
18
+
19
+ sentences = dataset["sentence"]
20
+ labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}  # NOTE(review): not read anywhere else in this script as shown — confirm whether it is still needed
21
+
22
+ predictions = []
23
+ for i, sentence in enumerate(sentences):
24
+ predictions.append(
25
+ getattr(eval_utils, inference_function)(model, tokenizer, sentence)
26
+ )
27
+ print("Inference progress: ", round((i + 1) / len(sentences), 2))
28
+
29
+ # Store the predictions in a private dataset
30
+ utils.upload_predictions(
31
+ os.environ["PREDICTIONS_DATASET_NAME"],
32
+ predictions,
33
+ model_name,
34
+ inference_function,
35
+ )
36
+
37
+ print("Inference completed!")
leaderboard_info.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NADI 2024 Leaderboard
2
+ This leaderboard serves as a public interface for benchmarking Arabic Dialect Identification (ADI) models using the NADI 2024 dataset, the first multi-label country-level ADI dataset.
3
+
4
+ ## Test Set Details
5
+ The test set used for evaluation is composed of 1000 sentences geolocated to the 14 most-populated Arab countries (excluding Somalia, from which data was scarce). Each sample is annotated by native speakers recruited from 9 different Arab countries, namely: Algeria, Egypt, Iraq, Morocco, Palestine, Sudan, Syria, Tunisia, and Yemen.
6
+
7
+ ## Evaluation Metrics
8
+ We compute the precision, recall, and F1 scores for each of the 9 countries (treating each label as a binary classification problem). Afterward, we compute the macro-average of each metric across the 9 countries.
9
+
10
+ ## Data Access
11
+ If you need to access the single-label training sets and the multi-label development set, please fill out the following form: https://forms.gle/t3QTC6ZqyDJBzAau8
12
+
13
+ #### Further Notes
14
+ * The beta version of the leaderboard is running on limited resources, and is not able to evaluate models with a relatively large number of parameters.
15
+ * Please refer to the [paper](https://aclanthology.org/2024.arabicnlp-1.79/) for more information about how the data was curated and annotated.
16
+ * We are planning to extend the annotations to include more country-level dialects. If you are interested in helping, please ping us, and we are happy to discuss it further.
script.py DELETED
@@ -1,5 +0,0 @@
1
- import time
2
-
3
- for i in range(1000):
4
- time.sleep(1)
5
- print(i)