Maharshi Gor committed on
Commit 55d797c · 1 Parent(s): 7acf14e

Implements leaderboard functionality and dataset download


Adds functions to download dataset snapshots and to fetch leaderboard data.
Integrates a leaderboard tab with periodic auto-refresh and a manual refresh button.
Improves logging for error tracking and debugging.
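
For reference, the leaderboard tab added to app.py follows the standard Gradio polling pattern: a gr.Timer periodically re-invokes the fetch function that backs the Dataframe, and a button wires the same function to an on-demand refresh. Below is a minimal, self-contained sketch of that pattern only (hypothetical column names and a stubbed fetch function, not the Space's actual populate/envs modules; assumes a Gradio release that provides gr.Timer):

import gradio as gr
import pandas as pd


def fetch_leaderboard_df() -> pd.DataFrame:
    # Stub: the real app re-downloads the results dataset snapshot from the Hub
    # and builds the dataframe from per-model JSON files (see populate.py in the diff).
    return pd.DataFrame({"Model": ["example/model"], "Buzz Accuracy": [0.42]})


with gr.Blocks() as demo:
    # Re-fetch every 600 seconds, mirroring LEADERBOARD_REFRESH_INTERVAL in envs.py.
    leaderboard_timer = gr.Timer(600)
    leaderboard_table = gr.Dataframe(
        value=fetch_leaderboard_df,  # callable value: evaluated on load and on each timer tick
        every=leaderboard_timer,
        interactive=False,
    )
    refresh_btn = gr.Button("🔄 Refresh")
    refresh_btn.click(fn=fetch_leaderboard_df, inputs=[], outputs=leaderboard_table)

demo.queue().launch()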

app.py CHANGED
@@ -2,23 +2,29 @@ import datasets
2
  import gradio as gr
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
 
5
 
 
 
6
  from app_configs import DEFAULT_SELECTIONS, THEME
7
  from components.quizbowl.bonus import BonusInterface
8
  from components.quizbowl.tossup import TossupInterface
 
9
  from display.custom_css import css_bonus, css_pipeline, css_tossup
10
  from display.guide import GUIDE_MARKDOWN
 
11
 
12
  # Constants
13
  from envs import (
14
  API,
15
  EVAL_REQUESTS_PATH,
16
  EVAL_RESULTS_PATH,
 
17
  PLAYGROUND_DATASET_NAMES,
18
  QUEUE_REPO,
19
  REPO_ID,
20
  RESULTS_REPO,
21
- TOKEN,
22
  )
23
  from workflows import factory
24
  from workflows.configs import AVAILABLE_MODELS
@@ -28,107 +34,27 @@ def restart_space():
28
  API.restart_space(repo_id=REPO_ID)
29
 
30
 
31
- # Space initialisation
32
- try:
33
- print(EVAL_REQUESTS_PATH)
34
- snapshot_download(
35
- repo_id=QUEUE_REPO,
36
- local_dir=EVAL_REQUESTS_PATH,
37
- repo_type="dataset",
38
- tqdm_class=None,
39
- etag_timeout=30,
40
- token=TOKEN,
41
- )
42
- except Exception:
43
- restart_space()
44
- try:
45
- print(EVAL_RESULTS_PATH)
46
- snapshot_download(
47
- repo_id=RESULTS_REPO,
48
- local_dir=EVAL_RESULTS_PATH,
49
- repo_type="dataset",
50
- tqdm_class=None,
51
- etag_timeout=30,
52
- token=TOKEN,
53
- )
54
- except Exception:
55
- restart_space()
56
-
57
- fonts_header = """
58
- <link rel="preconnect" href="https://fonts.googleapis.com">
59
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
60
- <link href="https://fonts.googleapis.com/css2?family=Shantell+Sans:ital,wght@0,300..800;1,300..800&display=swap" rel="stylesheet">
61
- <link href="https://fonts.googleapis.com/css2?family=Space+Mono:ital,wght@0,400;0,700;1,400;1,700&display=swap" rel="stylesheet">
62
- <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:ital,wght@0,100;0,200;0,300;0,400;0,500;0,600;0,700;1,100;1,200;1,300;1,400;1,500;1,600;1,700&display=swap" rel="stylesheet">
63
- """
64
-
65
- js_head = """
66
- <script>
67
- const gradioApp = document.getElementsByTagName('gradio-app')[0];
68
- console.log("Gradio app:", gradioApp);
69
- console.log(gradioApp.querySelectorAll('.token'));
70
- console.log(document.querySelectorAll('.token'));
71
-
72
- // Function to trigger Python callback
73
- const setHiddenIndex = (index) => {
74
- console.log("Setting hidden index to:", index);
75
- const hiddenIndex = gradioApp.querySelector("#hidden-index textarea");
76
- if (hiddenIndex) {
77
- hiddenIndex.value = index;
78
- let event = new Event("input", { bubbles: true});
79
- Object.defineProperty(event, "target", { value: hiddenIndex});
80
- hiddenIndex.dispatchEvent(event);
81
- }
82
- };
83
-
84
- // Add event listeners to all tokens
85
- function setupTokenListeners() {
86
- const tokens = gradioApp.querySelectorAll('.token');
87
- console.log("Tokens:", tokens);
88
- tokens.forEach(token => {
89
- token.addEventListener('mouseover', function() {
90
- const index = parseInt(this.getAttribute('data-index'));
91
- console.log("Mouseover token index:", index);
92
-
93
- // Reset all tokens
94
- gradioApp.querySelectorAll('.token').forEach(el => {
95
- el.classList.remove('highlighted');
96
- });
97
-
98
- // Highlight this token
99
- this.classList.add('highlighted');
100
-
101
- // Update the hidden index to trigger the Python callback
102
- setHiddenIndex(index);
103
- });
104
- });
105
- }
106
- console.log("Preamble complete");
107
-
108
- document.addEventListener("DOMContentLoaded", function() {
109
- // Setup initial listeners
110
- console.log("DOM fully loaded and parsed");
111
- setupTokenListeners();
112
-
113
- // Setup a mutation observer to handle dynamically added tokens
114
- const observer = new MutationObserver(function(mutations) {
115
- mutations.forEach(function(mutation) {
116
- if (mutation.addedNodes.length) {
117
- setupTokenListeners();
118
- }
119
- });
120
- });
121
 
122
- // Start observing the token container for changes
123
- const tokenContainer = gradioApp.querySelector('.token-container');
124
- console.log("Token container:", tokenContainer);
125
- if (tokenContainer) {
126
- observer.observe(tokenContainer.parentNode, { childList: true, subtree: true });
127
- }
128
- console.log("Listener setup complete");
129
- });
130
- </script>
131
- """
132
 
133
 
134
  def load_dataset(mode: str):
@@ -144,17 +70,24 @@ def load_dataset(mode: str):
144
  return ds
145
 
146
147
  if __name__ == "__main__":
148
  scheduler = BackgroundScheduler()
149
- scheduler.add_job(restart_space, "interval", seconds=1800)
150
  scheduler.start()
151
 
152
- full_css = css_pipeline + css_tossup + css_bonus
 
153
  tossup_ds = load_dataset("tossup")
154
  bonus_ds = load_dataset("bonus")
155
  with gr.Blocks(
156
- css=full_css,
157
- head=fonts_header + js_head,
158
  theme=THEME,
159
  title="Quizbowl Bot",
160
  ) as demo:
@@ -162,16 +95,31 @@ if __name__ == "__main__":
162
  gr.Markdown(GUIDE_MARKDOWN)
163
  with gr.Row():
164
  gr.Markdown("## Welcome to Quizbowl Bot! This is a tool for creating and testing quizbowl agents.")
165
- with gr.Tabs():
166
- with gr.Tab("Tossup Agents"):
167
  defaults = DEFAULT_SELECTIONS["tossup"] | {
168
  "init_workflow": factory.create_simple_qb_tossup_workflow(),
169
  }
170
  tossup_interface = TossupInterface(demo, tossup_ds, AVAILABLE_MODELS, defaults)
171
- with gr.Tab("Bonus Round Agents"):
172
  defaults = DEFAULT_SELECTIONS["bonus"] | {
173
  "init_workflow": factory.create_simple_qb_bonus_workflow(),
174
  }
175
  bonus_interface = BonusInterface(demo, bonus_ds, AVAILABLE_MODELS, defaults)
176
 
177
  demo.queue(default_concurrency_limit=40).launch()
 
2
  import gradio as gr
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
+ from loguru import logger
6
 
7
+ import populate
8
+ from about import LEADERBOARD_INTRODUCTION_TEXT, LEADERBOARD_TITLE
9
  from app_configs import DEFAULT_SELECTIONS, THEME
10
  from components.quizbowl.bonus import BonusInterface
11
  from components.quizbowl.tossup import TossupInterface
12
+ from display.css_html_js import fonts_header, js_head, leaderboard_css
13
  from display.custom_css import css_bonus, css_pipeline, css_tossup
14
  from display.guide import GUIDE_MARKDOWN
15
+ from display.utils import AutoEvalColumn, fields
16
 
17
  # Constants
18
  from envs import (
19
  API,
20
  EVAL_REQUESTS_PATH,
21
  EVAL_RESULTS_PATH,
22
+ LEADERBOARD_REFRESH_INTERVAL,
23
  PLAYGROUND_DATASET_NAMES,
24
  QUEUE_REPO,
25
  REPO_ID,
26
  RESULTS_REPO,
27
+ SERVER_REFRESH_INTERVAL,
28
  )
29
  from workflows import factory
30
  from workflows.configs import AVAILABLE_MODELS
 
34
  API.restart_space(repo_id=REPO_ID)
35
 
36
 
37
+ def download_dataset_snapshot(repo_id, local_dir):
38
+ try:
39
+ logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
40
+ snapshot_download(
41
+ repo_id=repo_id,
42
+ local_dir=local_dir,
43
+ repo_type="dataset",
44
+ tqdm_class=None,
45
+ )
46
+ except Exception as e:
47
+ logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
48
+ restart_space()
49
+
50
+
51
+ download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
52
 
53
+
54
+ def fetch_leaderboard_df():
55
+ logger.info("Leaderboard fetched...")
56
+ download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
57
+ return populate.get_leaderboard_df(EVAL_RESULTS_PATH)
58
 
59
 
60
  def load_dataset(mode: str):
 
70
  return ds
71
 
72
 
73
+ def get_default_tab_id(request: gr.Request):
74
+ logger.info(f"Request: {request}")
75
+ tab_key_value = request.query_params.get("tab", "tossup")
76
+ return gr.update(selected=tab_key_value)
77
+
78
+
79
  if __name__ == "__main__":
80
  scheduler = BackgroundScheduler()
81
+ scheduler.add_job(restart_space, "interval", seconds=SERVER_REFRESH_INTERVAL)
82
  scheduler.start()
83
 
84
+ css = css_pipeline + css_tossup + css_bonus + leaderboard_css
85
+ head = fonts_header + js_head
86
  tossup_ds = load_dataset("tossup")
87
  bonus_ds = load_dataset("bonus")
88
  with gr.Blocks(
89
+ css=css,
90
+ head=head,
91
  theme=THEME,
92
  title="Quizbowl Bot",
93
  ) as demo:
 
95
  gr.Markdown(GUIDE_MARKDOWN)
96
  with gr.Row():
97
  gr.Markdown("## Welcome to Quizbowl Bot! This is a tool for creating and testing quizbowl agents.")
98
+ with gr.Tabs() as gtab:
99
+ with gr.Tab("🛎️ Tossup Agents", id="tossup"):
100
  defaults = DEFAULT_SELECTIONS["tossup"] | {
101
  "init_workflow": factory.create_simple_qb_tossup_workflow(),
102
  }
103
  tossup_interface = TossupInterface(demo, tossup_ds, AVAILABLE_MODELS, defaults)
104
+ with gr.Tab("🙋🏻‍♂️ Bonus Round Agents", id="bonus"):
105
  defaults = DEFAULT_SELECTIONS["bonus"] | {
106
  "init_workflow": factory.create_simple_qb_bonus_workflow(),
107
  }
108
  bonus_interface = BonusInterface(demo, bonus_ds, AVAILABLE_MODELS, defaults)
109
+ with gr.Tab("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id="leaderboard"):
110
+ leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
111
+ gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
112
+ gr.Markdown(LEADERBOARD_INTRODUCTION_TEXT)
113
+ refresh_btn = gr.Button("🔄 Refresh")
114
+ leaderboard_table = gr.Dataframe(
115
+ value=fetch_leaderboard_df,
116
+ every=leaderboard_timer,
117
+ headers=[c.name for c in fields(AutoEvalColumn)],
118
+ datatype=[c.type for c in fields(AutoEvalColumn)],
119
+ elem_id="leaderboard-table",
120
+ interactive=False,
121
+ visible=True,
122
+ )
123
+ refresh_btn.click(fn=fetch_leaderboard_df, inputs=[], outputs=leaderboard_table)
124
 
125
  demo.queue(default_concurrency_limit=40).launch()
src/about.py ADDED
@@ -0,0 +1,40 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+
5
+ @dataclass
6
+ class Task:
7
+ benchmark: str
8
+ metric: str
9
+ col_name: str
10
+
11
+
12
+ # Select your tasks here
13
+ # ---------------------------------------------------
14
+ class Tasks(Enum):
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ task0 = Task("anli_r1", "acc", "ANLI")
17
+ task1 = Task("logiqa", "acc_norm", "LogiQA")
18
+
19
+
20
+ NUM_FEWSHOT = 0 # Change with your few shot
21
+ # ---------------------------------------------------
22
+
23
+
24
+ # Your leaderboard name
25
+ LEADERBOARD_TITLE = """<h1 align="center" id="space-title">QANTA Leaderboard</h1>"""
26
+
27
+ # What does your leaderboard evaluate?
28
+ LEADERBOARD_INTRODUCTION_TEXT = """
29
+ Build an open-domain QA system that can answer any question posed by humans! For more: https://sites.google.com/view/qanta/home
30
+ """
31
+
32
+
33
+ # Which evaluations are you running? how can people reproduce what you have?
34
+ LLM_BENCHMARKS_TEXT = """
35
+ ## How it works
36
+
37
+ ## Reproducibility
38
+ To reproduce our results, here are the commands you can run:
39
+
40
+ """
src/components/quizbowl/populate.py CHANGED
@@ -20,7 +20,7 @@ def get_pipeline_names(competition_type: str, profile: gr.OAuthProfile | None) -
20
  demo_example_names = submit.get_demo_example_submissions(competition_type)
21
  user_model_names = submit.get_user_submission_names(competition_type, profile)
22
  all_names = demo_example_names + user_model_names
23
- logger.info("Loaded model names: {all_names}")
24
  return all_names
25
 
26
 
 
20
  demo_example_names = submit.get_demo_example_submissions(competition_type)
21
  user_model_names = submit.get_user_submission_names(competition_type, profile)
22
  all_names = demo_example_names + user_model_names
23
+ logger.info(f"Loaded model names: {all_names}")
24
  return all_names
25
 
26
 
src/display/css_html_js.py CHANGED
@@ -1,4 +1,4 @@
1
- custom_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
@@ -46,27 +46,6 @@ custom_css = """
46
  white-space: nowrap;
47
  }
48
 
49
- /* Workflow JSON styling */
50
- .workflow-json-container {
51
- margin-top: 20px;
52
- margin-bottom: 30px;
53
- }
54
-
55
- .workflow-json {
56
- border: 1px solid #ddd;
57
- border-radius: 8px;
58
- box-shadow: 0 2px 5px rgba(0,0,0,0.1);
59
- }
60
-
61
- .workflow-json pre {
62
- max-height: 500px;
63
- overflow-y: auto;
64
- }
65
-
66
- .tab-buttons button {
67
- font-size: 20px;
68
- }
69
-
70
  #scale-logo {
71
  border-style: none !important;
72
  box-shadow: none;
@@ -113,6 +92,30 @@ custom_css = """
113
  }
114
  """
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  get_window_url_params = """
117
  function(url_params) {
118
  const params = new URLSearchParams(window.location.search);
@@ -120,3 +123,80 @@ get_window_url_params = """
120
  return url_params;
121
  }
122
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ leaderboard_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
46
  white-space: nowrap;
47
  }
49
  #scale-logo {
50
  border-style: none !important;
51
  box-shadow: none;
 
92
  }
93
  """
94
 
95
+
96
+ workflow_json_css = """
97
+ /* Workflow JSON styling */
98
+ .workflow-json-container {
99
+ margin-top: 20px;
100
+ margin-bottom: 30px;
101
+ }
102
+
103
+ .workflow-json {
104
+ border: 1px solid #ddd;
105
+ border-radius: 8px;
106
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
107
+ }
108
+
109
+ .workflow-json pre {
110
+ max-height: 500px;
111
+ overflow-y: auto;
112
+ }
113
+
114
+ .tab-buttons button {
115
+ font-size: 20px;
116
+ }
117
+ """
118
+
119
  get_window_url_params = """
120
  function(url_params) {
121
  const params = new URLSearchParams(window.location.search);
 
123
  return url_params;
124
  }
125
  """
126
+
127
+
128
+ fonts_header = """
129
+ <link rel="preconnect" href="https://fonts.googleapis.com">
130
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
131
+ <link href="https://fonts.googleapis.com/css2?family=Shantell+Sans:ital,wght@0,300..800;1,300..800&display=swap" rel="stylesheet">
132
+ <link href="https://fonts.googleapis.com/css2?family=Space+Mono:ital,wght@0,400;0,700;1,400;1,700&display=swap" rel="stylesheet">
133
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:ital,wght@0,100;0,200;0,300;0,400;0,500;0,600;0,700;1,100;1,200;1,300;1,400;1,500;1,600;1,700&display=swap" rel="stylesheet">
134
+ """
135
+
136
+ js_head = """
137
+ <script>
138
+ const gradioApp = document.getElementsByTagName('gradio-app')[0];
139
+ console.log("Gradio app:", gradioApp);
140
+ console.log(gradioApp.querySelectorAll('.token'));
141
+ console.log(document.querySelectorAll('.token'));
142
+
143
+ // Function to trigger Python callback
144
+ const setHiddenIndex = (index) => {
145
+ console.log("Setting hidden index to:", index);
146
+ const hiddenIndex = gradioApp.querySelector("#hidden-index textarea");
147
+ if (hiddenIndex) {
148
+ hiddenIndex.value = index;
149
+ let event = new Event("input", { bubbles: true});
150
+ Object.defineProperty(event, "target", { value: hiddenIndex});
151
+ hiddenIndex.dispatchEvent(event);
152
+ }
153
+ };
154
+
155
+ // Add event listeners to all tokens
156
+ function setupTokenListeners() {
157
+ const tokens = gradioApp.querySelectorAll('.token');
158
+ console.log("Tokens:", tokens);
159
+ tokens.forEach(token => {
160
+ token.addEventListener('mouseover', function() {
161
+ const index = parseInt(this.getAttribute('data-index'));
162
+ console.log("Mouseover token index:", index);
163
+
164
+ // Reset all tokens
165
+ gradioApp.querySelectorAll('.token').forEach(el => {
166
+ el.classList.remove('highlighted');
167
+ });
168
+
169
+ // Highlight this token
170
+ this.classList.add('highlighted');
171
+
172
+ // Update the hidden index to trigger the Python callback
173
+ setHiddenIndex(index);
174
+ });
175
+ });
176
+ }
177
+ console.log("Preamble complete");
178
+
179
+ document.addEventListener("DOMContentLoaded", function() {
180
+ // Setup initial listeners
181
+ console.log("DOM fully loaded and parsed");
182
+ setupTokenListeners();
183
+
184
+ // Setup a mutation observer to handle dynamically added tokens
185
+ const observer = new MutationObserver(function(mutations) {
186
+ mutations.forEach(function(mutation) {
187
+ if (mutation.addedNodes.length) {
188
+ setupTokenListeners();
189
+ }
190
+ });
191
+ });
192
+
193
+ // Start observing the token container for changes
194
+ const tokenContainer = gradioApp.querySelector('.token-container');
195
+ console.log("Token container:", tokenContainer);
196
+ if (tokenContainer) {
197
+ observer.observe(tokenContainer.parentNode, { childList: true, subtree: true });
198
+ }
199
+ console.log("Listener setup complete");
200
+ });
201
+ </script>
202
+ """
src/display/utils.py CHANGED
@@ -1,110 +1,20 @@
1
  from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
 
4
- import pandas as pd
5
-
6
- from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
 
11
 
12
- # These classes are for user facing column names,
13
- # to avoid having to change them all around the code
14
- # when a modif is needed
15
  @dataclass
16
  class ColumnContent:
17
  name: str
18
  type: str
19
- displayed_by_default: bool
20
- hidden: bool = False
21
- never_hidden: bool = False
22
 
23
- ## Leaderboard columns
24
  auto_eval_column_dict = []
25
- # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
- # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
-
46
- ## For the queue columns in the submission tab
47
- @dataclass(frozen=True)
48
- class EvalQueueColumn: # Queue column
49
- model = ColumnContent("model", "markdown", True)
50
- revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
- status = ColumnContent("status", "str", True)
55
-
56
- ## All the model information that we might need
57
- @dataclass
58
- class ModelDetails:
59
- name: str
60
- display_name: str = ""
61
- symbol: str = "" # emoji
62
-
63
-
64
- class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
- Unknown = ModelDetails(name="", symbol="?")
70
-
71
- def to_str(self, separator=" "):
72
- return f"{self.value.symbol}{separator}{self.value.name}"
73
-
74
- @staticmethod
75
- def from_str(type):
76
- if "fine-tuned" in type or "🔶" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟢" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "⭕" in type:
83
- return ModelType.IFT
84
- return ModelType.Unknown
85
-
86
- class WeightType(Enum):
87
- Adapter = ModelDetails("Adapter")
88
- Original = ModelDetails("Original")
89
- Delta = ModelDetails("Delta")
90
-
91
- class Precision(Enum):
92
- float16 = ModelDetails("float16")
93
- bfloat16 = ModelDetails("bfloat16")
94
- Unknown = ModelDetails("?")
95
-
96
- def from_str(precision):
97
- if precision in ["torch.float16", "float16"]:
98
- return Precision.float16
99
- if precision in ["torch.bfloat16", "bfloat16"]:
100
- return Precision.bfloat16
101
- return Precision.Unknown
102
-
103
- # Column selection
104
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
105
-
106
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
-
109
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
1
  from dataclasses import dataclass, make_dataclass
 
2
 
 
 
 
3
 
4
  def fields(raw_class):
5
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
6
 
7
 
 
 
 
8
  @dataclass
9
  class ColumnContent:
10
  name: str
11
  type: str
 
 
 
12
 
13
+
14
  auto_eval_column_dict = []
15
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown")])
16
+ auto_eval_column_dict.append(["buzz_accuracy", ColumnContent, ColumnContent("Buzz Accuracy ⬆️", "number")])
17
+ auto_eval_column_dict.append(["win_rate_human", ColumnContent, ColumnContent("Win Rate (Human Teams)", "number")])
18
+ auto_eval_column_dict.append(["win_rate_model", ColumnContent, ColumnContent("Win Rate (Model Teams)", "number")])
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
 
20
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
src/display/utils_old.py ADDED
@@ -0,0 +1,116 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from about import Tasks
7
+
8
+
9
+ def fields(raw_class):
10
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
11
+
12
+
13
+ # These classes are for user facing column names,
14
+ # to avoid having to change them all around the code
15
+ # when a modif is needed
16
+ @dataclass
17
+ class ColumnContent:
18
+ name: str
19
+ type: str
20
+ displayed_by_default: bool
21
+ hidden: bool = False
22
+ never_hidden: bool = False
23
+
24
+
25
+ ## Leaderboard columns
26
+ auto_eval_column_dict = []
27
+ # Init
28
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
30
+ # Scores
31
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
32
+ for task in Tasks:
33
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
34
+ # Model information
35
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
36
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
37
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
38
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
39
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
40
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
41
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
42
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
43
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
44
+
45
+ # We use make dataclass to dynamically fill the scores from Tasks
46
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
47
+
48
+
49
+ ## For the queue columns in the submission tab
50
+ @dataclass(frozen=True)
51
+ class EvalQueueColumn: # Queue column
52
+ model = ColumnContent("model", "markdown", True)
53
+ revision = ColumnContent("revision", "str", True)
54
+ private = ColumnContent("private", "bool", True)
55
+ precision = ColumnContent("precision", "str", True)
56
+ weight_type = ColumnContent("weight_type", "str", "Original")
57
+ status = ColumnContent("status", "str", True)
58
+
59
+
60
+ ## All the model information that we might need
61
+ @dataclass
62
+ class ModelDetails:
63
+ name: str
64
+ display_name: str = ""
65
+ symbol: str = "" # emoji
66
+
67
+
68
+ class ModelType(Enum):
69
+ PT = ModelDetails(name="pretrained", symbol="🟢")
70
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
71
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
72
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
73
+ Unknown = ModelDetails(name="", symbol="?")
74
+
75
+ def to_str(self, separator=" "):
76
+ return f"{self.value.symbol}{separator}{self.value.name}"
77
+
78
+ @staticmethod
79
+ def from_str(type):
80
+ if "fine-tuned" in type or "🔶" in type:
81
+ return ModelType.FT
82
+ if "pretrained" in type or "🟢" in type:
83
+ return ModelType.PT
84
+ if "RL-tuned" in type or "🟦" in type:
85
+ return ModelType.RL
86
+ if "instruction-tuned" in type or "⭕" in type:
87
+ return ModelType.IFT
88
+ return ModelType.Unknown
89
+
90
+
91
+ class WeightType(Enum):
92
+ Adapter = ModelDetails("Adapter")
93
+ Original = ModelDetails("Original")
94
+ Delta = ModelDetails("Delta")
95
+
96
+
97
+ class Precision(Enum):
98
+ float16 = ModelDetails("float16")
99
+ bfloat16 = ModelDetails("bfloat16")
100
+ Unknown = ModelDetails("?")
101
+
102
+ def from_str(precision):
103
+ if precision in ["torch.float16", "float16"]:
104
+ return Precision.float16
105
+ if precision in ["torch.bfloat16", "bfloat16"]:
106
+ return Precision.bfloat16
107
+ return Precision.Unknown
108
+
109
+
110
+ # Column selection
111
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
112
+
113
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
114
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
115
+
116
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/envs.py CHANGED
@@ -9,22 +9,22 @@ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
9
  ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
10
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
11
 
12
- OWNER = (
13
- "umdclip" # Change to your org - don't forget to create a results and request dataset, with the correct format!
14
- )
15
- # ----------------------------------
16
 
17
  REPO_ID = f"{OWNER}/quizbowl-submission"
18
  QUEUE_REPO = f"{OWNER}/advcal-requests"
19
- RESULTS_REPO = f"{OWNER}/advcal-results"
20
 
21
  EXAMPLES_PATH = "examples"
22
 
23
  PLAYGROUND_DATASET_NAMES = {
24
- "tossup": "umdclip/acf-co24-tossups",
25
- "bonus": "umdclip/acf-co24-bonuses",
26
  }
27
 
 
 
28
  # If you setup a cache later, just change HF_HOME
29
  CACHE_PATH = os.getenv("HF_HOME", ".")
30
 
@@ -35,4 +35,7 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
35
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
36
 
37
 
 
 
 
38
  API = HfApi(token=TOKEN)
 
9
  ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
10
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
11
 
12
+ # Change to your org - don't forget to create a results and request dataset, with the correct format!
13
+ OWNER = "umdclip"
 
 
14
 
15
  REPO_ID = f"{OWNER}/quizbowl-submission"
16
  QUEUE_REPO = f"{OWNER}/advcal-requests"
17
+ RESULTS_REPO = f"{OWNER}/model-results" # TODO: change to advcal-results after testing is done
18
 
19
  EXAMPLES_PATH = "examples"
20
 
21
  PLAYGROUND_DATASET_NAMES = {
22
+ "tossup": f"{OWNER}/acf-co24-tossups",
23
+ "bonus": f"{OWNER}/acf-co24-bonuses",
24
  }
25
 
26
+ # ----------------------------------
27
+
28
  # If you setup a cache later, just change HF_HOME
29
  CACHE_PATH = os.getenv("HF_HOME", ".")
30
 
 
35
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
36
 
37
 
38
+ SERVER_REFRESH_INTERVAL = 86400 # seconds (one day)
39
+ LEADERBOARD_REFRESH_INTERVAL = 600 # seconds (10 minutes)
40
+
41
  API = HfApi(token=TOKEN)
src/leaderboard/__init__.py ADDED
File without changes
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,195 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from display.formatting import make_clickable_model
11
+ from display.utils_old import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
12
+ from submission.check_validity import is_model_on_hub
13
+
14
+
15
+ @dataclass
16
+ class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
18
+
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
+ model: str
23
+ revision: str # commit hash, "" if main
24
+ results: dict
25
+ precision: Precision = Precision.Unknown
26
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
+ weight_type: WeightType = WeightType.Original # Original or Adapter
28
+ architecture: str = "Unknown"
29
+ license: str = "?"
30
+ likes: int = 0
31
+ num_params: int = 0
32
+ date: str = "" # submission date of request file
33
+ still_on_hub: bool = False
34
+
35
+ @classmethod
36
+ def init_from_json_file(self, json_filepath):
37
+ """Inits the result from the specific model result file"""
38
+ with open(json_filepath) as fp:
39
+ data = json.load(fp)
40
+
41
+ config = data.get("config")
42
+
43
+ # Precision
44
+ precision = Precision.from_str(config.get("model_dtype"))
45
+
46
+ # Get model and org
47
+ org_and_model = config.get("model_name", config.get("model_args", None))
48
+ org_and_model = org_and_model.split("/", 1)
49
+
50
+ if len(org_and_model) == 1:
51
+ org = None
52
+ model = org_and_model[0]
53
+ result_key = f"{model}_{precision.value.name}"
54
+ else:
55
+ org = org_and_model[0]
56
+ model = org_and_model[1]
57
+ result_key = f"{org}_{model}_{precision.value.name}"
58
+ full_model = "/".join(org_and_model)
59
+
60
+ still_on_hub, _, model_config = is_model_on_hub(
61
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
+ )
63
+ architecture = "?"
64
+ if model_config is not None:
65
+ architectures = getattr(model_config, "architectures", None)
66
+ if architectures:
67
+ architecture = ";".join(architectures)
68
+
69
+ # Extract results available in this file (some results are split in several files)
70
+ results = {}
71
+ for task in Tasks:
72
+ task = task.value
73
+
74
+ # We average all scores of a given metric (not all metrics are present in all files)
75
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
+ if accs.size == 0 or any([acc is None for acc in accs]):
77
+ continue
78
+
79
+ mean_acc = np.mean(accs) * 100.0
80
+ results[task.benchmark] = mean_acc
81
+
82
+ return self(
83
+ eval_name=result_key,
84
+ full_model=full_model,
85
+ org=org,
86
+ model=model,
87
+ results=results,
88
+ precision=precision,
89
+ revision=config.get("model_sha", ""),
90
+ still_on_hub=still_on_hub,
91
+ architecture=architecture,
92
+ )
93
+
94
+ def update_with_request_file(self, requests_path):
95
+ """Finds the relevant request file for the current model and updates info with it"""
96
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
+
98
+ try:
99
+ with open(request_file, "r") as f:
100
+ request = json.load(f)
101
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
102
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
103
+ self.license = request.get("license", "?")
104
+ self.likes = request.get("likes", 0)
105
+ self.num_params = request.get("params", 0)
106
+ self.date = request.get("submitted_time", "")
107
+ except Exception:
108
+ print(
109
+ f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
110
+ )
111
+
112
+ def to_dict(self):
113
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
114
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
115
+ data_dict = {
116
+ "eval_name": self.eval_name, # not a column, just a save name,
117
+ AutoEvalColumn.precision.name: self.precision.value.name,
118
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
119
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
120
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
121
+ AutoEvalColumn.architecture.name: self.architecture,
122
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
123
+ AutoEvalColumn.revision.name: self.revision,
124
+ AutoEvalColumn.average.name: average,
125
+ AutoEvalColumn.license.name: self.license,
126
+ AutoEvalColumn.likes.name: self.likes,
127
+ AutoEvalColumn.params.name: self.num_params,
128
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
129
+ }
130
+
131
+ for task in Tasks:
132
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
133
+
134
+ return data_dict
135
+
136
+
137
+ def get_request_file_for_model(requests_path, model_name, precision):
138
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
139
+ request_files = os.path.join(
140
+ requests_path,
141
+ f"{model_name}_eval_request_*.json",
142
+ )
143
+ request_files = glob.glob(request_files)
144
+
145
+ # Select correct request file (precision)
146
+ request_file = ""
147
+ request_files = sorted(request_files, reverse=True)
148
+ for tmp_request_file in request_files:
149
+ with open(tmp_request_file, "r") as f:
150
+ req_content = json.load(f)
151
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
152
+ request_file = tmp_request_file
153
+ return request_file
154
+
155
+
156
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
157
+ """From the path of the results folder root, extract all needed info for results"""
158
+ model_result_filepaths = []
159
+
160
+ for root, _, files in os.walk(results_path):
161
+ # We should only have json files in model results
162
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
163
+ continue
164
+
165
+ # Sort the files by date
166
+ try:
167
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
168
+ except dateutil.parser._parser.ParserError:
169
+ files = [files[-1]]
170
+
171
+ for file in files:
172
+ model_result_filepaths.append(os.path.join(root, file))
173
+
174
+ eval_results = {}
175
+ for model_result_filepath in model_result_filepaths:
176
+ # Creation of result
177
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
178
+ eval_result.update_with_request_file(requests_path)
179
+
180
+ # Store results of same eval together
181
+ eval_name = eval_result.eval_name
182
+ if eval_name in eval_results.keys():
183
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
184
+ else:
185
+ eval_results[eval_name] = eval_result
186
+
187
+ results = []
188
+ for v in eval_results.values():
189
+ try:
190
+ v.to_dict() # we test if the dict version is complete
191
+ results.append(v)
192
+ except KeyError: # not all eval values present
193
+ continue
194
+
195
+ return results
src/populate.py ADDED
@@ -0,0 +1,69 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from display.formatting import make_clickable_model
7
+ from display.utils_old import EvalQueueColumn
8
+
9
+
10
+ def get_leaderboard_df(results_path: str) -> pd.DataFrame:
11
+ model_result_filepaths = []
12
+ for root, _, files in os.walk(results_path):
13
+ if len(files) == 0 or not all(f.endswith(".json") for f in files):
14
+ continue
15
+ for file in files:
16
+ model_result_filepaths.append(os.path.join(root, file))
17
+
18
+ eval_results = {"model": [], "buzz_accuracy": [], "win_rate_human": [], "win_rate_model": []}
19
+ for model_result_filepath in model_result_filepaths:
20
+ with open(model_result_filepath, "r") as fin:
21
+ model_result = json.load(fin)
22
+ model_id = model_result["model_id"]
23
+ buzz_accuracy = model_result["buzz_accuracy"]
24
+ win_rate_human = model_result["win_rate_human"]
25
+ win_rate_model = model_result["win_rate_model"]
26
+ eval_results["model"].append(model_id)
27
+ eval_results["buzz_accuracy"].append(buzz_accuracy)
28
+ eval_results["win_rate_human"].append(win_rate_human)
29
+ eval_results["win_rate_model"].append(win_rate_model)
30
+ return pd.DataFrame(eval_results)
31
+
32
+
33
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
34
+ # TODO: This function is stale, but might be a good reference point for new implementation
35
+ """Creates the different dataframes for the evaluation queues requestes"""
36
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
37
+ all_evals = []
38
+
39
+ for entry in entries:
40
+ if ".json" in entry:
41
+ file_path = os.path.join(save_path, entry)
42
+ with open(file_path) as fp:
43
+ data = json.load(fp)
44
+
45
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
46
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
47
+
48
+ all_evals.append(data)
49
+ elif ".md" not in entry:
50
+ # this is a folder
51
+ sub_entries = [
52
+ e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
53
+ ]
54
+ for sub_entry in sub_entries:
55
+ file_path = os.path.join(save_path, entry, sub_entry)
56
+ with open(file_path) as fp:
57
+ data = json.load(fp)
58
+
59
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
60
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
61
+ all_evals.append(data)
62
+
63
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
64
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
65
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
66
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
67
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
68
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
69
+ return df_finished[cols], df_running[cols], df_pending[cols]