Clémentine committed on
Commit
741edbf
1 Parent(s): 5d94f6d

updated app

Files changed (2)
  1. app.py +48 -51
  2. requirements.txt +1 -1
app.py CHANGED
@@ -15,34 +15,29 @@ from huggingface_hub import HfApi
  from scorer import question_scorer
  from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

- BALM_TOKEN = os.environ.get("BALM_TOKEN", None)
+ BALM_TOKEN = os.environ.get("WTOKEN", None)

  OWNER="gaia-benchmark"
  DATA_DATASET = f"{OWNER}/GAIA"
- SUBMISSION_DATASET = f"{OWNER}/submissions"
+ INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
+ SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
  RESULTS_DATASET = f"{OWNER}/results"
  LEADERBOARD_PATH = f"{OWNER}/leaderboard"
-
- SPLIT="validation" #Change to test once we are ready to go
  api = HfApi()

+ YEAR_VERSION = "2023"
+
  os.makedirs("scored", exist_ok=True)

  # Display the results
- eval_results = {}
- for level in range(1, 4):
-     eval_results[level] = load_dataset(RESULTS_DATASET, f"2023_level{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
-
-
- eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
- eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
- eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
+ eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN)
+ eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
+ eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))

  # Gold answers
  gold_results = {}
- for level in range(1, 4):
-     level_dataset = load_dataset(DATA_DATASET, f"2023_level{level}", split=SPLIT, use_auth_token=BALM_TOKEN)
-     gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}
+ gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", use_auth_token=BALM_TOKEN)
+ gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}


  def restart_space():
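The gold answers are now keyed by split and then by task_id, keeping the whole row so that both "Level" and "Final answer" can be read back during scoring. A self-contained sketch of that lookup, with invented rows standing in for the private gaia-benchmark/GAIA_internal data:

# Invented rows; only the comprehension mirrors the commit.
gold_dataset = {
    "validation": [
        {"task_id": "task-001", "Level": 1, "Final answer": "Paris"},
        {"task_id": "task-002", "Level": 3, "Final answer": "42"},
    ],
    "test": [],
}
gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
row = gold_results["validation"]["task-001"]
print(row["Level"], row["Final answer"])  # 1 Paris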
@@ -53,14 +48,12 @@ COLS = ["Model", "Score ⬆️", "Organisation"]
  TYPES = ["str", "number", "str",]

  def add_new_eval(
-     level_of_dev: str,
+     val_or_test: str,
      model: str,
      path_to_file,
      organisation: str,
      mail: str,
  ):
-     level = int(level_of_dev.split(" ")[-1])
-
      # Very basic email parsing
      _, parsed_mail = parseaddr(mail)
      if not "@" in parsed_mail:
@@ -69,21 +62,25 @@ def add_new_eval(
      print("Adding new eval")

      # Check if the combination model/org already exists and prints a warning message if yes
-     if model.lower() in set(eval_results[level]["model"]) and organisation.lower() in set(eval_results[level]["organisation"]):
+     if model.lower() in set(eval_results[val_or_test]["model"]) and organisation.lower() in set(eval_results[val_or_test]["organisation"]):
          return format_warning("This model has been already submitted.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")

      # Save submitted file
      api.upload_file(
          repo_id=SUBMISSION_DATASET,
          path_or_fileobj=path_to_file.name,
-         path_in_repo=f"{organisation}/{model}/level{level}_raw_{datetime.datetime.today()}.jsonl",
+         path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
          repo_type="dataset",
          token=BALM_TOKEN
      )

      # Compute score
      file_path = path_to_file.name
-     total_score = 0
+     scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+     num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
      with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
          with open(file_path, 'r') as f:
              for line in f:
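The new path_in_repo values embed datetime.datetime.today() directly in the file name. A small sketch of what that renders to, with invented organisation and model names:

import datetime

organisation, model, YEAR_VERSION, val_or_test = "my-org", "my-model", "2023", "validation"  # invented
path_in_repo = f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl"
print(path_in_repo)
# e.g. my-org/my-model/2023_validation_raw_2024-01-15 10:42:03.123456.jsonl
# datetime.datetime.today() leaves spaces and colons in the repository path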
@@ -93,24 +90,29 @@ def add_new_eval(
                      raise Exception("No model_answer key in the file provided")
                  answer = task["model_answer"]
                  task_id = task["task_id"]
-
-                 score = question_scorer(task['model_answer'], gold_results[level][task_id])
+                 level = int(gold_results[val_or_test][task_id]["Level"])
+
+                 score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])

                  scored_file.write(
                      json.dumps({
                          "id": task_id,
                          "model_answer": answer,
-                         "score": score
+                         "score": score,
+                         "level": level
                      }) + "\n"
                  )

-                 total_score += score
+                 scores["all"] += score
+                 scores[level] += score
+                 num_questions["all"] += 1
+                 num_questions[level] += 1

      # Save scored file
      api.upload_file(
          repo_id=SUBMISSION_DATASET,
          path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-         path_in_repo=f"{organisation}/{model}/level{level}_scored_{datetime.datetime.today()}.jsonl",
+         path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
          repo_type="dataset",
          token=BALM_TOKEN
      )
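The single total_score counter is replaced by per-level tallies. A self-contained sketch of the aggregation on invented (level, score) pairs; note that the averages computed in the next hunk divide by num_questions[level], so a submission is expected to contain at least one question of every level:

results = [(1, 1), (1, 0), (2, 1), (3, 0)]  # invented (level, score) pairs

scores = {"all": 0, 1: 0, 2: 0, 3: 0}
num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
for level, score in results:
    scores["all"] += score
    scores[level] += score
    num_questions["all"] += 1
    num_questions[level] += 1

print({k: scores[k] / num_questions[k] for k in scores})  # {'all': 0.5, 1: 0.5, 2: 1.0, 3: 0.0}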
@@ -118,25 +120,25 @@ def add_new_eval(
      # Actual submission
      eval_entry = {
          "model": model,
-         "score": total_score,
          "organisation": organisation,
          "mail": mail,
+         "score": scores["all"]/num_questions["all"],
+         "score_level1": scores[1]/num_questions[1],
+         "score_level2": scores[2]/num_questions[2],
+         "score_level3": scores[3]/num_questions[3],
      }
-     eval_results[level] = eval_results[level].add_item(eval_entry)
-     # TODO: change split to "test" once we have the actual results
-     eval_results[level].push_to_hub(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)
+     eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+     print(eval_results)
+     eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=BALM_TOKEN)

      return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")


  def refresh():
-     eval_results = {}
-     for level in range(1, 4):
-         eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
-     eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
-     eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
-     eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
-     return eval_dataframe_1, eval_dataframe_2, eval_dataframe_3
+     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN, download_mode="force_redownload")
+     eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
+     eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+     return eval_dataframe_val, eval_dataframe_test

  def upload_file(files):
      file_paths = [file.name for file in files]
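Submissions are now appended to the chosen split with Dataset.add_item, and the whole DatasetDict is pushed back under the "2023" config. A minimal sketch with toy in-memory data; the push line is commented out because it needs Hub credentials, and real entries also carry mail and the per-level scores:

from datasets import Dataset, DatasetDict

eval_results = DatasetDict({
    "validation": Dataset.from_list([{"model": "baseline", "organisation": "org", "score": 0.1}]),
    "test": Dataset.from_list([{"model": "baseline", "organisation": "org", "score": 0.2}]),
})
entry = {"model": "my-model", "organisation": "my-org", "score": 0.5}
eval_results["validation"] = eval_results["validation"].add_item(entry)  # add_item returns a new Dataset
# eval_results.push_to_hub("gaia-benchmark/results", config_name="2023", token="hf_...")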
@@ -156,17 +158,13 @@ with demo:
          elem_id="citation-button",
      ).style(show_copy_button=True)

-     with gr.Tab("Results: Level 1"):
-         leaderboard_table_1 = gr.components.Dataframe(
-             value=eval_dataframe_1, headers=COLS, datatype=TYPES, interactive=False,
-         )
-     with gr.Tab("Results: Level 2"):
-         leaderboard_table_2 = gr.components.Dataframe(
-             value=eval_dataframe_2, headers=COLS, datatype=TYPES, interactive=False,
+     with gr.Tab("Results: Validation"):
+         leaderboard_table_val = gr.components.Dataframe(
+             value=eval_dataframe_val, headers=COLS, datatype=TYPES, interactive=False,
          )
-     with gr.Tab("Results: Level 3"):
-         leaderboard_table_3 = gr.components.Dataframe(
-             value=eval_dataframe_3, headers=COLS, datatype=TYPES, interactive=False,
+     with gr.Tab("Results: Test"):
+         leaderboard_table_test = gr.components.Dataframe(
+             value=eval_dataframe_test, headers=COLS, datatype=TYPES, interactive=False,
          )

      refresh_button = gr.Button("Refresh")
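The three per-level tabs collapse into one Validation tab and one Test tab, both refreshed by the same button wired up in the next hunk. A stripped-down sketch of that layout with invented frames; gr.Blocks stands in for the demo object defined earlier in the file:

import gradio as gr
import pandas as pd

df_val = pd.DataFrame({"Model": ["baseline"], "Score ⬆️": [0.1], "Organisation": ["org"]})
df_test = pd.DataFrame({"Model": ["baseline"], "Score ⬆️": [0.2], "Organisation": ["org"]})

def refresh():
    # The real app re-downloads the results dataset; this stub returns the same frames.
    return df_val, df_test

with gr.Blocks() as demo:
    with gr.Tab("Results: Validation"):
        table_val = gr.components.Dataframe(value=df_val, interactive=False)
    with gr.Tab("Results: Test"):
        table_test = gr.components.Dataframe(value=df_test, interactive=False)
    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, inputs=[], outputs=[table_val, table_test])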
@@ -174,15 +172,14 @@ with demo:
          refresh,
          inputs=[],
          outputs=[
-             leaderboard_table_1,
-             leaderboard_table_2,
-             leaderboard_table_3,
+             leaderboard_table_val,
+             leaderboard_table_test,
          ],
      )
      with gr.Accordion("Submit a new model for evaluation"):
          with gr.Row():
              with gr.Column():
-                 level_of_test = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label="{split} set level")
+                 level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                  model_name_textbox = gr.Textbox(label="Model name")
                  file_output = gr.File()
              with gr.Column():
requirements.txt CHANGED
@@ -18,7 +18,7 @@ filelock==3.11.0
  fonttools==4.39.3
  frozenlist==1.3.3
  fsspec==2023.4.0
- datasets
+ datasets==2.14.5
  gradio==3.27.0
  gradio_client==0.1.3
  h11==0.14.0