djstrong committed on
Commit
0d2a785
·
1 Parent(s): 331e613

add perplexity

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. src/about.py +3 -0
  3. src/display/utils.py +1 -1
  4. src/leaderboard/read_evals.py +27 -4
app.py CHANGED
@@ -246,7 +246,7 @@ with demo:
246
  interactive=False,
247
  visible=True,
248
  # column_widths=["2%", "33%"]
249
- height=900
250
  )
251
 
252
  # Dummy leaderboard for handling the case when the user uses backspace key
 
246
  interactive=False,
247
  visible=True,
248
  # column_widths=["2%", "33%"]
249
+ height=800
250
  )
251
 
252
  # Dummy leaderboard for handling the case when the user uses backspace key
src/about.py CHANGED
@@ -20,6 +20,7 @@ class Tasks(Enum):
20
  task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
21
  task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
22
  task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
 
23
  task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
24
  task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
25
  task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
@@ -31,6 +32,7 @@ class Tasks(Enum):
31
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
32
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
33
  task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
 
34
 
35
  NUM_FEWSHOT = 0 # Change with your few shot
36
  # ---------------------------------------------------
@@ -72,6 +74,7 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
72
  * add metadata for models (e.g. #Params)
73
  * add more tasks
74
  * use model templates
 
75
 
76
  ## Tasks
77
 
 
20
  task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
21
  task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
22
  task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
23
+ #task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
24
  task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
25
  task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
26
  task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
 
32
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
33
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
34
  task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
35
+ task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "polish_poleval2018_task3_test_10k", "other")
36
 
37
  NUM_FEWSHOT = 0 # Change with your few shot
38
  # ---------------------------------------------------
 
74
  * add metadata for models (e.g. #Params)
75
  * add more tasks
76
  * use model templates
77
+ * fix scrolling on Firefox
78
 
79
  ## Tasks
80
 
src/display/utils.py CHANGED
@@ -26,6 +26,7 @@ auto_eval_column_dict = []
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 
29
  auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
30
  #Scores
31
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
@@ -39,7 +40,6 @@ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Arch
39
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
40
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
41
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
42
- auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
43
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
44
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
45
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
30
  auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
31
  #Scores
32
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 
40
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
41
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
42
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 
43
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
44
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
45
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
src/leaderboard/read_evals.py CHANGED
@@ -33,6 +33,7 @@ class EvalResult:
33
  date: str = "" # submission date of request file
34
  still_on_hub: bool = False
35
  n_shot: NShotType = NShotType.n0
 
36
 
37
  @classmethod
38
  def init_from_json_file(self, json_filepath, n_shot_num):
@@ -48,6 +49,7 @@ class EvalResult:
48
 
49
  # Get model and org
50
  org_and_model = config.get("model_name", config.get("model_args", None))
 
51
  SPICHLERZ_ORG = "speakleash/"
52
 
53
  if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
@@ -91,7 +93,10 @@ class EvalResult:
91
  if accs.size == 0 or any([acc is None for acc in accs]):
92
  continue
93
 
94
- mean_acc = np.mean(accs) * 100.0
 
 
 
95
  results[task.benchmark] = mean_acc
96
 
97
  return self(
@@ -104,7 +109,8 @@ class EvalResult:
104
  revision= config.get("model_sha", ""),
105
  still_on_hub=still_on_hub,
106
  architecture=architecture,
107
- n_shot=NShotType.from_str(n_shot_num)
 
108
  )
109
 
110
  def update_with_metadata(self, metadata):
@@ -139,10 +145,10 @@ class EvalResult:
139
 
140
  def to_dict(self):
141
  """Converts the Eval Result to a dict compatible with our dataframe display"""
142
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
143
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
144
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
145
-
 
146
  average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
147
  average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
148
 
@@ -352,4 +358,21 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
352
  print(f"not all eval values present {v.eval_name} {v.full_model}")
353
  continue
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  return results
 
33
  date: str = "" # submission date of request file
34
  still_on_hub: bool = False
35
  n_shot: NShotType = NShotType.n0
36
+ org_and_model: str = ""
37
 
38
  @classmethod
39
  def init_from_json_file(self, json_filepath, n_shot_num):
 
49
 
50
  # Get model and org
51
  org_and_model = config.get("model_name", config.get("model_args", None))
52
+ orig_org_and_model = org_and_model
53
  SPICHLERZ_ORG = "speakleash/"
54
 
55
  if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
 
93
  if accs.size == 0 or any([acc is None for acc in accs]):
94
  continue
95
 
96
+ if 'perplexity' in task.metric:
97
+ mean_acc = np.mean(accs)
98
+ else:
99
+ mean_acc = np.mean(accs) * 100.0
100
  results[task.benchmark] = mean_acc
101
 
102
  return self(
 
109
  revision= config.get("model_sha", ""),
110
  still_on_hub=still_on_hub,
111
  architecture=architecture,
112
+ n_shot=NShotType.from_str(n_shot_num),
113
+ org_and_model=orig_org_and_model
114
  )
115
 
116
  def update_with_metadata(self, metadata):
 
145
 
146
  def to_dict(self):
147
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
148
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
149
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
150
+ all_tasks = g_tasks + mc_tasks
151
+ average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
152
  average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
153
  average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
154
 
 
358
  print(f"not all eval values present {v.eval_name} {v.full_model}")
359
  continue
360
 
361
+ missing_results_for_task = {}
362
+ for v in eval_results.values():
363
+ r = v.to_dict()
364
+ for task in Tasks:
365
+ if r[task.value.col_name] is None:
366
+ task_name = f"{r['n_shot']}|{task.value.benchmark}"
367
+ if task_name in missing_results_for_task:
368
+ missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
369
+ else:
370
+ missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
371
+
372
+ # print('missing_results_for_task', missing_results_for_task)
373
+ for task, models in missing_results_for_task.items():
374
+ print(f"Missing results for {task} for {len(models)} models")
375
+ print(" ".join(models))
376
+
377
+
378
  return results