sheonhan committed on
Commit e05ec6c
1 Parent(s): 7644705

Fix elo ratings model links

Files changed (2):
  1. app.py +1 -1
  2. elo_utils.py +59 -11
app.py CHANGED
@@ -205,7 +205,7 @@ def get_leaderboard_df():
 def get_evaluation_queue_df():
     if repo:
         print("Pulling changes for the evaluation queue.")
-        # repo.git_pull()
+        repo.git_pull()
 
     entries = [
         entry
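For context, repo in app.py is a local clone of the evaluation-queue dataset repo on the Hub, so re-enabling repo.git_pull() makes the Space fetch the newest evaluation requests each time the queue dataframe is built. Below is a minimal sketch of the assumed setup using huggingface_hub.Repository; the local path and dataset repo id are illustrative placeholders, not taken from this commit:

from huggingface_hub import Repository

# Hypothetical setup: keep a local clone of the evaluation-queue dataset so that
# get_evaluation_queue_df() can pull fresh entries before reading them.
repo = Repository(
    local_dir="./evals/",              # assumed local checkout path
    clone_from="org-name/eval-queue",  # assumed dataset repo id (placeholder)
    repo_type="dataset",
)

def get_evaluation_queue_df():
    if repo:
        print("Pulling changes for the evaluation queue.")
        repo.git_pull()  # sync with the Hub before listing pending requests
    ...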
elo_utils.py CHANGED
@@ -8,10 +8,37 @@ from datasets import load_dataset
 
 from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
 from utils import make_clickable_model
-from visualizations import (get_bootstrap_result, switch_model_a_b,
-                            visualize_battle_count, visualize_bootstrap_scores,
-                            visualize_pairwise_win_fraction,
-                            visualize_rating_count)
+from visualizations import (
+    get_bootstrap_result,
+    switch_model_a_b,
+    visualize_battle_count,
+    visualize_bootstrap_scores,
+    visualize_pairwise_win_fraction,
+    visualize_rating_count,
+)
+
+
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/HuggingFaceH4/stable-vicuna-13b-2904"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+
+
+def make_clickable_model_elo(model_name):
+    link = ""
+    if model_name == "dolly-12b":
+        link = DOLLY_LINK
+    elif model_name == "vicuna-13b":
+        link = VICUNA_LINK
+    elif model_name == "koala-13b":
+        link = KOALA_LINK
+    elif model_name == "oasst-12b":
+        link = OASST_LINK
+    else:
+        link = MODEL_PAGE
+
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 @dataclass
@@ -26,7 +53,7 @@ class EloEvalResult:
     def to_dict(self):
         base_model = f"{self.model}"
         data_dict = {}
-        data_dict["Model"] = make_clickable_model(base_model)
+        data_dict["Model"] = make_clickable_model_elo(base_model)
         data_dict["GPT-4 (all)"] = self.gpt_4_all
         data_dict["Human (all)"] = self.human_all
         data_dict["Human (instruct)"] = self.human_instruct
@@ -61,7 +88,13 @@ def create_eval_df(df, tie_allowed):
         }
 
         if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
@@ -84,7 +117,13 @@ def create_eval_df_for_gpt(df, tie_allowed):
         }
 
        if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
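The rewrapped conditionals in the two hunks above are behavior-preserving. As a standalone sketch of the same mapping (rating_to_win is an illustrative helper, not part of the codebase; the rating scale itself is not defined in this diff, only the thresholds are):

def rating_to_win(rating, tie_allowed):
    # Same thresholds as create_eval_df / create_eval_df_for_gpt above.
    if tie_allowed:
        # Below 4 favors model_a, above 5 favors model_b, anything in between is a tie.
        return "model_a" if rating < 4 else "model_b" if rating > 5 else "tie"
    # Without ties, ratings below 5 go to model_a and the rest to model_b.
    return "model_a" if rating < 5 else "model_b"

assert rating_to_win(3, tie_allowed=True) == "model_a"
assert rating_to_win(4.5, tie_allowed=True) == "tie"
assert rating_to_win(6, tie_allowed=True) == "model_b"
assert rating_to_win(5, tie_allowed=False) == "model_b"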
 
@@ -124,13 +163,20 @@ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
     df_all = pd.concat([df_instruct, df_code_instruct])
 
     df_gpt_4 = load_dataset(
-        "gpt_4_evals/data/", split="train", revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846"
+        "gpt_4_evals/data/",
+        split="train",
+        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
     ).to_pandas()
 
     dfs = [df_instruct, df_code_instruct, df_all]
-    elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs]
+    elo_ratings = [
+        convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
+        for df in dfs
+    ]
 
-    gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed))
+    gpt_4_elo_ratings = convert_rating_from_float_to_int(
+        create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
+    )
     elo_ratings.append(gpt_4_elo_ratings)
 
     results = [
@@ -166,7 +212,9 @@ def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
 
     BOOTSTRAP_ROUNDS = 1000
     if "bootstrap_elo_lu" not in globals():
-        bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS)
+        bootstrap_elo_lu = get_bootstrap_result(
+            game_switch, compute_elo, BOOTSTRAP_ROUNDS
+        )
 
     plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
 
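As a quick check of the new helper, the four hard-coded chat models resolve to their dedicated model pages while any other name falls back to the generic Hub listing, which is what fixes the broken links in the Elo table (presumably the Elo display names are short labels rather than full Hub repo ids, so the generic make_clickable_model from utils could not build working URLs for them). For example:

print(make_clickable_model_elo("koala-13b"))
# <a target="_blank" href="https://huggingface.co/TheBloke/koala-13B-HF" ...>koala-13b</a>

print(make_clickable_model_elo("unknown-model"))
# <a target="_blank" href="https://huggingface.co/models" ...>unknown-model</a>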