rahulnair23 committed on
Commit
beff7ec
·
1 Parent(s): d39c67a

updates + MMLU + XSUM

Browse files
app.py CHANGED
@@ -8,7 +8,7 @@ from selfrank.algos.iterative import SelfRank
8
  from selfrank.algos.baseline import MCARank
9
  from selfrank.algos.triplet import equality, rouge
10
  import matplotlib.pyplot as plt
11
-
12
 
13
  class UI:
14
 
@@ -19,33 +19,55 @@ class UI:
19
  def header_block(self):
20
  """Title/description"""
21
 
22
- with open("assets/header.md", 'r') as f:
23
  content = f.read()
24
-
25
- gr.Markdown(content)
26
- gr.Markdown('---')
27
- gr.Markdown('<br>')
28
 
 
 
 
29
 
30
  def selection_panel(self):
31
  """user selections"""
32
  gr.Markdown("""<h1 style='color: purple;'> Ranking with benchmarks </h1> """)
33
- gr.Markdown("""Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data.""")
34
- with gr.Column(variant='compact'):
 
 
35
  self.data = gr.Dropdown(
36
  choices=["CNN/DM", "XSUM", "MMLU"],
37
- multiselect=False, value='CNN/DM',
 
38
  label="Choose a dataset.",
39
  info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
40
  interactive=True,
41
  )
 
42
  self.evaluation = gr.Dropdown(
43
  choices=["Rouge", "Equality"],
44
- multiselect=False, value='Rouge',
 
45
  interactive=True,
46
  label="Evaluation function",
47
  info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
48
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  self.nmodels = gr.Dropdown(
50
  choices=["All", 10, 20, 30],
51
  label="Number of models",
@@ -64,46 +86,48 @@ class UI:
64
  choices=["Greedy", "Full"],
65
  label="Algorithm variant to use",
66
  info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
67
- value='Full',
68
  interactive=True,
69
  )
70
  self.btn_execute = gr.Button("Run")
71
 
72
-
73
  def output_panel(self):
74
  """Plots/leaderboard/bump charts"""
75
- with gr.Column(variant='default'):
76
  gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
77
- self.leaderboard = gr.DataFrame()
 
78
 
79
- with gr.Column(variant='default'):
80
- gr.Markdown("""<h2 style='color: purple;'> Comparison to 'true' ranking </h2> """)
81
- #self.bumpchart = gr.Plot(format='png')
 
 
82
  self.bumpchart = gr.Image()
83
  self.eval_metrics = gr.Markdown()
84
-
85
  def synth_panel(self):
86
- """ Synthetic data experiments """
87
- gr.Markdown('<br>')
88
- gr.Markdown('---')
89
  gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
90
  gr.Markdown("Coming soon.")
91
 
92
  def byod_panel(self):
93
- """ Instructions panel """
94
- gr.Markdown('<br>')
95
- gr.Markdown('---')
96
- with open("assets/instructions.md", 'r') as f:
97
  content = f.read()
98
  gr.Markdown(content)
99
- gr.Markdown('---')
100
-
101
  def load_css(self):
102
- with open('style.css', 'r') as file:
103
  self.css = file.read()
104
-
105
  def layout(self):
106
- """ Assemble the overall layout """
107
 
108
  with gr.Blocks(theme=gr.themes.Default()) as demo:
109
  self.header_block()
@@ -117,67 +141,81 @@ class UI:
117
  # Output panel/leaderboard
118
  self.output_panel()
119
 
120
- #TODO: self.synth_panel()
121
  self.byod_panel()
122
-
123
  # Register event listeners
124
  self.btn_execute.click(
125
- fn=self.benchmark_executor, inputs=[self.data, self.evaluation, self.nmodels, self.nrows, self.method],
126
- outputs=[self.leaderboard, self.bumpchart, self.eval_metrics]
 
 
 
 
 
 
 
 
127
  )
128
 
129
- return demo
130
-
131
- def benchmark_executor(self, data, evaluation, nmodels, nrows, method) -> tuple[pd.DataFrame, plt.figure]:
132
- """ Main execution flow for benchmarks """
133
-
134
- #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
 
 
135
  seed = 40
136
  np.random.seed(seed)
137
-
138
  match data:
139
- case 'MMLU':
140
- adf = pd.read_pickle(f"data/mmlu_subject_abstract_algebra.pkl")
141
- MODELS = adf.model.unique()
142
-
143
- case 'CNN/DM':
144
  adf = pd.read_pickle(f"data/cnndm.pkl")
145
- MODELS = adf.model.unique()
146
 
147
- case 'XSUM':
148
- raise NotImplementedError
149
-
150
  case _:
151
  raise ValueError(f"'{data}' not understood.")
152
 
 
 
153
  # Sample fewer models if so needed
154
  if nmodels != "All":
155
  if nmodels < len(MODELS):
156
-
157
  MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
158
  adf = adf[adf.model.isin(MODELS)]
159
 
160
  match data:
161
- case 'MMLU':
162
- keys = ["id", "trial_id", "perturbation"] # MMLU has this extra parameter
163
- case 'CNN/DM':
 
 
 
 
164
  keys = ["id", "trial_id"]
165
  case _:
166
  pass
167
 
168
  df = adf.pivot_table(
169
- columns="model",
170
- index=keys,
171
- values="output",
172
- aggfunc="first",
173
- )
174
 
175
  # Filter by number of rows
176
  df.dropna(inplace=True)
177
  if nrows != "All":
178
  if nrows < df.shape[0]:
179
  df = df.sample(nrows, random_state=seed)
180
-
181
  # Compute true ranking
182
  adf = adf.set_index(keys).loc[df.index].reset_index()
183
 
@@ -190,7 +228,7 @@ class UI:
190
  adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
191
  delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
192
  )
193
-
194
  # Method 2 - look at "win rates" - for each question, see which model
195
  # wins (i.e. has the best ROUGE score)
196
  idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
@@ -201,8 +239,8 @@ class UI:
201
  no_wins = list(set(MODELS) - set(win_rate_rank))
202
  true_ranking = win_rate_rank + no_wins
203
  evaluator = rouge
204
-
205
- elif evaluation == 'Equality':
206
 
207
  # Compute the true ranking (multiple choice - so use equality between
208
  # LLM response and reference-value)
@@ -217,52 +255,55 @@ class UI:
217
 
218
  else:
219
  raise ValueError(f"'{evaluation}' not understood.")
220
-
221
  match method:
222
- case 'Full':
223
  ranker = SelfRank(MODELS, evaluator, true_ranking)
224
-
225
- case 'Greedy':
226
  ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
227
-
228
- case 'MCA':
229
  raise NotImplementedError
230
  case _:
231
  raise ValueError(f"'{method}' not understood.")
232
-
233
-
234
  # generate outputs
235
  ranker.fit(df)
236
  ranks = ranker.ranking
237
- from itertools import zip_longest
238
- ranks = [j + i for i, j in zip_longest(ranks, ["πŸ₯‡ ", "πŸ₯ˆ ", "πŸ₯‰ "], fillvalue='')]
239
- out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranks})
 
 
240
 
241
- out_metrics = {"rbo": ranker.measure(metric="rbo"),
 
242
  "map-1": ranker.measure(metric="mapk", k=1),
243
  "map-3": ranker.measure(metric="mapk", k=3),
244
  "map-5": ranker.measure(metric="mapk", k=5),
245
  "map-10": ranker.measure(metric="mapk", k=10),
246
- "evaluations": evaluator.calls
247
  }
248
- eval_metrics = (f"<h2> Evaluation measures </h2>"
249
- f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
250
- f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
251
- f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
252
- f"MAP-10 : {out_metrics['map-10']: 0.3f}.")
 
 
253
 
254
  out_plot = ranker.plot()
255
-
256
- return out_df, "output.png", eval_metrics
257
 
 
258
 
259
  def run(self):
260
  self.ui = self.layout()
261
  self.ui.queue().launch(show_error=True)
262
 
263
 
264
- #if __name__ == "__main__":
265
  ui = UI()
266
- #ui.run()
267
  demo = ui.layout()
268
  demo.launch()
 
8
  from selfrank.algos.baseline import MCARank
9
  from selfrank.algos.triplet import equality, rouge
10
  import matplotlib.pyplot as plt
11
+ from itertools import zip_longest
12
 
13
  class UI:
14
 
 
19
  def header_block(self):
20
  """Title/description"""
21
 
22
+ with open("assets/header.md", "r") as f:
23
  content = f.read()
 
 
 
 
24
 
25
+ gr.Markdown(content)
26
+ gr.Markdown("---")
27
+ gr.Markdown("<br>")
28
 
29
  def selection_panel(self):
30
  """user selections"""
31
  gr.Markdown("""<h1 style='color: purple;'> Ranking with benchmarks </h1> """)
32
+ gr.Markdown(
33
+ """Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data."""
34
+ )
35
+ with gr.Column(variant="compact"):
36
  self.data = gr.Dropdown(
37
  choices=["CNN/DM", "XSUM", "MMLU"],
38
+ multiselect=False,
39
+ value="CNN/DM",
40
  label="Choose a dataset.",
41
  info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
42
  interactive=True,
43
  )
44
+ self.mmlu = gr.Dropdown(visible=False)
45
  self.evaluation = gr.Dropdown(
46
  choices=["Rouge", "Equality"],
47
+ multiselect=False,
48
+ value="Rouge",
49
  interactive=True,
50
  label="Evaluation function",
51
  info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
52
  )
53
+
54
+ def update_mmlu(v):
55
+ if v == "MMLU":
56
+ return gr.Dropdown(
57
+ choices=list(['abstract_algebra', 'college_chemistry', 'computer_security', 'econometrics', 'us_foreign_policy']),
58
+ value='us_foreign_policy',
59
+ multiselect=False,
60
+ label="Choose MMLU subject.",
61
+ info="MMLU subject area.",
62
+ interactive=True,
63
+ visible=True,
64
+ ), gr.Dropdown(choices=['Equality'], value='Equality')
65
+ else:
66
+ return gr.Dropdown(visible=False), gr.Dropdown(choices=['Rouge'], value='Rouge')
67
+
68
+ self.data.change(fn=update_mmlu, inputs=self.data, outputs=[self.mmlu, self.evaluation])
69
+
70
+
71
  self.nmodels = gr.Dropdown(
72
  choices=["All", 10, 20, 30],
73
  label="Number of models",
 
86
  choices=["Greedy", "Full"],
87
  label="Algorithm variant to use",
88
  info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
89
+ value="Full",
90
  interactive=True,
91
  )
92
  self.btn_execute = gr.Button("Run")
93
 
 
94
  def output_panel(self):
95
  """Plots/leaderboard/bump charts"""
96
+ with gr.Column(variant="default"):
97
  gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
98
+ self.leaderboard = gr.DataFrame(headers=["rank", "model"],
99
+ datatype=["number", "str"])
100
 
101
+ with gr.Column(variant="default"):
102
+ gr.Markdown(
103
+ """<h2 style='color: purple;'> Comparison to 'true' ranking </h2> """
104
+ )
105
+ # self.bumpchart = gr.Plot(format='png')
106
  self.bumpchart = gr.Image()
107
  self.eval_metrics = gr.Markdown()
108
+
109
  def synth_panel(self):
110
+ """Synthetic data experiments"""
111
+ gr.Markdown("<br>")
112
+ gr.Markdown("---")
113
  gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
114
  gr.Markdown("Coming soon.")
115
 
116
  def byod_panel(self):
117
+ """Instructions panel"""
118
+ gr.Markdown("<br>")
119
+ gr.Markdown("---")
120
+ with open("assets/instructions.md", "r") as f:
121
  content = f.read()
122
  gr.Markdown(content)
123
+ gr.Markdown("---")
124
+
125
  def load_css(self):
126
+ with open("style.css", "r") as file:
127
  self.css = file.read()
128
+
129
  def layout(self):
130
+ """Assemble the overall layout"""
131
 
132
  with gr.Blocks(theme=gr.themes.Default()) as demo:
133
  self.header_block()
 
141
  # Output panel/leaderboard
142
  self.output_panel()
143
 
144
+ # TODO: self.synth_panel()
145
  self.byod_panel()
146
+
147
  # Register event listeners
148
  self.btn_execute.click(
149
+ fn=self.benchmark_executor,
150
+ inputs=[
151
+ self.data,
152
+ self.mmlu,
153
+ self.evaluation,
154
+ self.nmodels,
155
+ self.nrows,
156
+ self.method,
157
+ ],
158
+ outputs=[self.leaderboard, self.bumpchart, self.eval_metrics],
159
  )
160
 
161
+ return demo
162
+
163
+ def benchmark_executor(
164
+ self, data, mmlu_subject, evaluation, nmodels, nrows, method
165
+ ) -> tuple[pd.DataFrame, plt.figure]:
166
+ """Main execution flow for benchmarks"""
167
+
168
+ # gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
169
  seed = 40
170
  np.random.seed(seed)
171
+
172
  match data:
173
+ case "MMLU":
174
+ adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")
175
+
176
+ case "CNN/DM":
 
177
  adf = pd.read_pickle(f"data/cnndm.pkl")
 
178
 
179
+ case "XSUM":
180
+ adf = pd.read_pickle(f"data/xsum.pkl")
181
+
182
  case _:
183
  raise ValueError(f"'{data}' not understood.")
184
 
185
+ MODELS = adf.model.unique()
186
+
187
  # Sample fewer models if so needed
188
  if nmodels != "All":
189
  if nmodels < len(MODELS):
190
+
191
  MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
192
  adf = adf[adf.model.isin(MODELS)]
193
 
194
  match data:
195
+ case "MMLU":
196
+ keys = [
197
+ "id",
198
+ "trial_id",
199
+ "perturbation",
200
+ ] # MMLU has this extra parameter
201
+ case "CNN/DM" | "XSUM":
202
  keys = ["id", "trial_id"]
203
  case _:
204
  pass
205
 
206
  df = adf.pivot_table(
207
+ columns="model",
208
+ index=keys,
209
+ values="output",
210
+ aggfunc="first",
211
+ )
212
 
213
  # Filter by number of rows
214
  df.dropna(inplace=True)
215
  if nrows != "All":
216
  if nrows < df.shape[0]:
217
  df = df.sample(nrows, random_state=seed)
218
+
219
  # Compute true ranking
220
  adf = adf.set_index(keys).loc[df.index].reset_index()
221
 
 
228
  adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
229
  delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
230
  )
231
+
232
  # Method 2 - look at "win rates" - for each question, see which model
233
  # wins (i.e. has the best ROUGE score)
234
  idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
 
239
  no_wins = list(set(MODELS) - set(win_rate_rank))
240
  true_ranking = win_rate_rank + no_wins
241
  evaluator = rouge
242
+
243
+ elif evaluation == "Equality":
244
 
245
  # Compute the true ranking (multiple choice - so use equality between
246
  # LLM response and reference-value)
 
255
 
256
  else:
257
  raise ValueError(f"'{evaluation}' not understood.")
258
+
259
  match method:
260
+ case "Full":
261
  ranker = SelfRank(MODELS, evaluator, true_ranking)
262
+
263
+ case "Greedy":
264
  ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
265
+
266
+ case "MCA":
267
  raise NotImplementedError
268
  case _:
269
  raise ValueError(f"'{method}' not understood.")
270
+
 
271
  # generate outputs
272
  ranker.fit(df)
273
  ranks = ranker.ranking
274
+
275
+ ranks = [
276
+ j + i for i, j in zip_longest(ranks, ["πŸ₯‡ ", "πŸ₯ˆ ", "πŸ₯‰ "], fillvalue="")
277
+ ]
278
+ out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})
279
 
280
+ out_metrics = {
281
+ "rbo": ranker.measure(metric="rbo"),
282
  "map-1": ranker.measure(metric="mapk", k=1),
283
  "map-3": ranker.measure(metric="mapk", k=3),
284
  "map-5": ranker.measure(metric="mapk", k=5),
285
  "map-10": ranker.measure(metric="mapk", k=10),
286
+ "evaluations": evaluator.calls,
287
  }
288
+ eval_metrics = (
289
+ f"<h2> Evaluation measures </h2>"
290
+ f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
291
+ f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
292
+ f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
293
+ f"MAP-10 : {out_metrics['map-10']: 0.3f}."
294
+ )
295
 
296
  out_plot = ranker.plot()
 
 
297
 
298
+ return out_df, "output.png", eval_metrics
299
 
300
  def run(self):
301
  self.ui = self.layout()
302
  self.ui.queue().launch(show_error=True)
303
 
304
 
305
+ # if __name__ == "__main__":
306
  ui = UI()
307
+ # ui.run()
308
  demo = ui.layout()
309
  demo.launch()
assets/instructions.md CHANGED
@@ -6,19 +6,22 @@ Source code is available as a pip installable python package.
6
 
7
  Use of a virtual environment is recommended.
8
  ```bash
9
- $ conda create -n selfrank python=3.10
 
 
 
 
10
  ```
11
 
12
- To install,
13
  ```bash
14
- $ conda activate selfrank
15
- $ pip install git+https://huggingface.co/spaces/ibm/llm-rank-themselves.git
16
  ```
17
 
18
  ## Usage
19
 
20
  Start by gathering model inferences for the same question/prompt across all models you want to rank. The ranking method expects a pandas dataframe, with a row for each prompt, and a column for each model, i.e.
21
- | | M1 | M2 | M3 | ... |
22
  |:-----------|:-----|:-----|:-----|:------|
23
  | Q1 | a | a | b | ... |
24
  | Q2 | a | b | b | ... |
 
6
 
7
  Use of a virtual environment is recommended.
8
  ```bash
9
+ conda create -n selfrank python=3.10
10
+ ```
11
+ Activate the virtual environment
12
+ ```bash
13
+ conda activate selfrank
14
  ```
15
 
16
+ and then install,
17
  ```bash
18
+ pip install git+https://huggingface.co/spaces/ibm/llm-rank-themselves.git
 
19
  ```
20
 
21
  ## Usage
22
 
23
  Start by gathering model inferences for the same question/prompt across all models you want to rank. The ranking method expects a pandas dataframe, with a row for each prompt, and a column for each model, i.e.
24
+ | | M1 | M2 | M3 | ... |
25
  |:-----------|:-----|:-----|:-----|:------|
26
  | Q1 | a | a | b | ... |
27
  | Q2 | a | b | b | ... |
data/mmlu_subject_college_chemistry.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9181b1c32b176ab903f51282a79507702ffb3f3b356d85e5bb9fad2b6c052bd6
3
+ size 8778542
data/mmlu_subject_computer_security.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7141d827015681bf279d1f833bf420698c7aff0e75c95081c2e95d334900e367
3
+ size 10070152
data/mmlu_subject_econometrics.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a0f86c63ecc1a415b30e8513018d0d2900ce7b1b265d24024501b7a0309d7d8
3
+ size 14779002
data/mmlu_subject_us_foreign_policy.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e125afd27b5a177697ae87145bac5e485fae43e6f8ef8a0405f1dc2ee63bee
3
+ size 7014950
data/xsum.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97af7f7179036f50c2aa10ce0d40a017b0bc467518057951fba6ef69e8d2a733
3
+ size 11067330