AMR-KELEG committed
Commit 0382281 · 1 Parent(s): 465af14

Populate the leaderboard

Files changed (2)
  1. app.py +54 -27
  2. utils.py +10 -3
app.py CHANGED
@@ -11,13 +11,63 @@ from inspect import getmembers, isfunction
 import eval_utils
 import utils
 import numpy as np
+import pandas as pd
 from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
 
 
 tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])
 
 with tab1:
-    st.write("Leaderboard")
+    # Load the labels
+    dataset_name = os.environ["DATASET_NAME"]
+    dataset = datasets.load_dataset(dataset_name)["test"]
+    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
+
+    # Load the models' predictions
+    model_predictions_rows = datasets.load_dataset(
+        os.environ["PREDICTIONS_DATASET_NAME"]
+    )["train"]
+
+    evaluation_metrics = []
+    for row in model_predictions_rows:
+        # Evaluate the models
+        accuracy_scores = {}
+        f1_scores = {}
+        recall_scores = {}
+        precision_scores = {}
+        predictions = row["predictions"]
+
+        for dialect in DIALECTS_WITH_LABELS:
+            y_true = labels[dialect]
+            y_pred = [dialect in prediction for prediction in predictions]
+            accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            precision = precision_score(y_true, y_pred)
+
+            accuracy_scores[dialect] = accuracy
+            f1_scores[dialect] = f1
+            recall_scores[dialect] = recall
+            precision_scores[dialect] = precision
+
+        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
+        macro_avg_f1 = np.mean(list(f1_scores.values()))
+        macro_avg_recall = np.mean(list(recall_scores.values()))
+        macro_avg_precision = np.mean(list(precision_scores.values()))
+
+        evaluation_metrics.append(
+            {
+                "model_name": row["model_name"],
+                "macro_avg_accuracy": macro_avg_accuracy,
+                "macro_avg_f1": macro_avg_f1,
+                "macro_avg_recall": macro_avg_recall,
+                "macro_avg_precision": macro_avg_precision,
+            }
+        )
+    results_df = pd.DataFrame(evaluation_metrics).sort_values(
+        "macro_avg_f1", ascending=False
+    )
+    st.table(results_df)
 
 with tab2:
     model_name = st.text_input("Enter a model's name on HF")
@@ -25,6 +75,7 @@ with tab2:
         "Inference Method",
         [func_name for func_name, _ in getmembers(eval_utils, isfunction)],
     )
+
    if model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
@@ -45,31 +96,7 @@ with tab2:
 
         # Store the predictions in a private dataset
         utils.upload_predictions(
-            os.environ["PREDICTIONS_DATASET_NAME"], predictions, model_name
+            os.environ["PREDICTIONS_DATASET_NAME"], predictions, model_name, inference_function
         )
 
-        # Evaluate the model
-        accuracy_scores = {}
-        f1_scores = {}
-        recall_scores = {}
-        precision_scores = {}
-
-        for dialect in DIALECTS_WITH_LABELS:
-            y_true = labels[dialect]
-            y_pred = [dialect in prediction for prediction in predictions]
-            accuracy = accuracy_score(y_true, y_pred)
-            f1 = f1_score(y_true, y_pred)
-            recall = recall_score(y_true, y_pred)
-            precision = precision_score(y_true, y_pred)
-
-            accuracy_scores[dialect] = accuracy
-            f1_scores[dialect] = f1
-            recall_scores[dialect] = recall
-            precision_scores[dialect] = precision
-
-        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
-        macro_avg_f1 = np.mean(list(f1_scores.values()))
-        macro_avg_recall = np.mean(list(recall_scores.values()))
-        macro_avg_precision = np.mean(list(precision_scores.values()))
-
-        st.toast(f"Evaluation completed!")
+        st.toast(f"Inference completed!")
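
For reference, a minimal self-contained sketch of the per-dialect evaluation that the new tab1 code performs; the dialect names and the label/prediction shapes below are illustrative assumptions, not the real contents of DATASET_NAME or the predictions dataset:

import numpy as np
from sklearn.metrics import f1_score

# Illustrative stand-ins: in the app, labels come from the test split of
# os.environ["DATASET_NAME"] and predictions from the private predictions dataset.
DIALECTS_WITH_LABELS = ["Egypt", "Morocco"]
labels = {
    "Egypt": [True, False, True],
    "Morocco": [False, False, True],
}
predictions = [["Egypt"], ["Morocco"], ["Egypt", "Morocco"]]  # one dialect list per sample

f1_scores = {}
for dialect in DIALECTS_WITH_LABELS:
    y_true = labels[dialect]
    # A sample counts as positive for a dialect iff that dialect appears in its prediction list.
    y_pred = [dialect in prediction for prediction in predictions]
    f1_scores[dialect] = f1_score(y_true, y_pred)

# The leaderboard table is sorted by this macro-average across dialects.
macro_avg_f1 = np.mean(list(f1_scores.values()))
print(f1_scores, macro_avg_f1)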
utils.py CHANGED
@@ -8,13 +8,20 @@ def current_seconds_time():
     return round(time.time())
 
 
-def upload_predictions(repo_id, predictions, model_name):
+def upload_predictions(repo_id, predictions, model_name, inference_function):
     api = HfApi()
 
+    timestamp = current_seconds_time()
     predictions_filename = (
-        f"predictions_{current_seconds_time()}_{re.sub('/', '_', model_name)}.json"
+        f"predictions_{timestamp}_{re.sub('/', '_', model_name)}.json"
     )
-    predictions_object = {"model_name": model_name, "predictions": predictions}
+
+    predictions_object = {
+        "model_name": model_name,
+        "predictions": predictions,
+        "timestamp": timestamp,
+        "inference_function": inference_function,
+    }
 
     with open(predictions_filename, "w") as f:
         json.dump(predictions_object, f)
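
Note that the hunk ends at json.dump, so the actual push of the written JSON file to the predictions dataset happens outside the shown context. A hedged sketch of how such an upload could be done with huggingface_hub; the helper name and exact arguments are assumptions, not the code in utils.py:

from huggingface_hub import HfApi

def push_predictions_file(repo_id: str, predictions_filename: str) -> None:
    # Illustrative only: upload the locally written JSON into the private
    # predictions dataset repo (repo_id would be PREDICTIONS_DATASET_NAME).
    api = HfApi()
    api.upload_file(
        path_or_fileobj=predictions_filename,
        path_in_repo=predictions_filename,
        repo_id=repo_id,
        repo_type="dataset",
    )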