Spaces:

gaia-benchmark
/

leaderboard

Runtime error

App Files Community

Clémentine committed on Oct 20, 2023

Commit

741edbf

1 Parent(s): 5d94f6d

updated app

Browse files

Files changed (2) hide show

app.py +48 -51
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -15,34 +15,29 @@ from huggingface_hub import HfApi
 from scorer import question_scorer
 from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
-BALM_TOKEN = os.environ.get("BALM_TOKEN", None)
 OWNER="gaia-benchmark"
 DATA_DATASET = f"{OWNER}/GAIA"
-SUBMISSION_DATASET = f"{OWNER}/submissions"
 RESULTS_DATASET = f"{OWNER}/results"
 LEADERBOARD_PATH = f"{OWNER}/leaderboard"
-SPLIT="validation" #Change to test once we are ready to go
 api = HfApi()
 os.makedirs("scored", exist_ok=True)
 # Display the results
-eval_results = {}
-for level in range(1, 4):
-    eval_results[level] = load_dataset(RESULTS_DATASET, f"2023_level{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
-eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
-eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
-eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
 # Gold answers
 gold_results = {}
-for level in range(1, 4):
-    level_dataset = load_dataset(DATA_DATASET, f"2023_level{level}", split=SPLIT, use_auth_token=BALM_TOKEN)
-    gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}
 def restart_space():
@@ -53,14 +48,12 @@ COLS = ["Model", "Score ⬆️", "Organisation"]
 TYPES = ["str", "number", "str",]
 def add_new_eval(
-    level_of_dev: str,
     model: str,
     path_to_file,
     organisation: str,
     mail: str,
 ):
-    level = int(level_of_dev.split(" ")[-1])
     # Very basic email parsing
     _, parsed_mail = parseaddr(mail)
     if not "@" in parsed_mail:
@@ -69,21 +62,25 @@ def add_new_eval(
     print("Adding new eval")
     # Check if the combination model/org already exists and prints a warning message if yes
-    if model.lower() in set(eval_results[level]["model"]) and organisation.lower() in set(eval_results[level]["organisation"]):
         return format_warning("This model has been already submitted.")
     # Save submitted file
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
         path_or_fileobj=path_to_file.name,
-        path_in_repo=f"{organisation}/{model}/level{level}_raw_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=BALM_TOKEN
     )
     # Compute score
     file_path = path_to_file.name
-    total_score = 0
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for line in f:
@@ -93,24 +90,29 @@ def add_new_eval(
                     raise Exception("No model_answer key in the file provided")
                 answer = task["model_answer"]
                 task_id = task["task_id"]
-                score = question_scorer(task['model_answer'], gold_results[level][task_id])
                 scored_file.write(
                     json.dumps({
                         "id": task_id,
                         "model_answer": answer,
-                        "score": score
                     }) + "\n"
                 )
-                total_score += score
     # Save scored file
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
         path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-        path_in_repo=f"{organisation}/{model}/level{level}_scored_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=BALM_TOKEN
     )
@@ -118,25 +120,25 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "model": model,
-        "score": total_score,
         "organisation": organisation,
         "mail": mail,
     }
-    eval_results[level] = eval_results[level].add_item(eval_entry)
-    # TODO: change split to "test" once we have the actual results
-    eval_results[level].push_to_hub(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)
     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")
 def refresh():
-    eval_results = {}
-    for level in range(1, 4):
-        eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
-    eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
-    eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
-    eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
-    return eval_dataframe_1, eval_dataframe_2, eval_dataframe_3
 def upload_file(files):
     file_paths = [file.name for file in files]
@@ -156,17 +158,13 @@ with demo:
                 elem_id="citation-button",
             ).style(show_copy_button=True)
-    with gr.Tab("Results: Level 1"):
-        leaderboard_table_1 = gr.components.Dataframe(
-            value=eval_dataframe_1, headers=COLS, datatype=TYPES, interactive=False,
-        )
-    with gr.Tab("Results: Level 2"):
-        leaderboard_table_2 = gr.components.Dataframe(
-            value=eval_dataframe_2, headers=COLS, datatype=TYPES, interactive=False,
         )
-    with gr.Tab("Results: Level 3"):
-        leaderboard_table_3 = gr.components.Dataframe(
-            value=eval_dataframe_3, headers=COLS, datatype=TYPES, interactive=False,
         )
     refresh_button = gr.Button("Refresh")
@@ -174,15 +172,14 @@ with demo:
         refresh,
         inputs=[],
         outputs=[
-            leaderboard_table_1,
-            leaderboard_table_2,
-            leaderboard_table_3,
         ],
     )
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             with gr.Column():
-                level_of_test = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label="{split} set level")
                 model_name_textbox = gr.Textbox(label="Model name")
                 file_output = gr.File()
             with gr.Column():

 from scorer import question_scorer
 from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+BALM_TOKEN = os.environ.get("WTOKEN", None)
 OWNER="gaia-benchmark"
 DATA_DATASET = f"{OWNER}/GAIA"
+INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
+SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
 RESULTS_DATASET = f"{OWNER}/results"
 LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 api = HfApi()
+YEAR_VERSION = "2023"
 os.makedirs("scored", exist_ok=True)
 # Display the results
+eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN)
+eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
+eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
 # Gold answers
 gold_results = {}
+gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", use_auth_token=BALM_TOKEN)
+gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
 def restart_space():
 TYPES = ["str", "number", "str",]
 def add_new_eval(
+    val_or_test: str,
     model: str,
     path_to_file,
     organisation: str,
     mail: str,
 ):
     # Very basic email parsing
     _, parsed_mail = parseaddr(mail)
     if not "@" in parsed_mail:
     print("Adding new eval")
     # Check if the combination model/org already exists and prints a warning message if yes
+    if model.lower() in set(eval_results[val_or_test]["model"]) and organisation.lower() in set(eval_results[val_or_test]["organisation"]):
         return format_warning("This model has been already submitted.")
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
     # Save submitted file
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
         path_or_fileobj=path_to_file.name,
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=BALM_TOKEN
     )
     # Compute score
     file_path = path_to_file.name
+    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
     with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for line in f:
                     raise Exception("No model_answer key in the file provided")
                 answer = task["model_answer"]
                 task_id = task["task_id"]
+                level = int(gold_results[val_or_test][task_id]["Level"])
+                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
                 scored_file.write(
                     json.dumps({
                         "id": task_id,
                         "model_answer": answer,
+                        "score": score,
+                        "level": level
                     }) + "\n"
                 )
+                scores["all"] += score
+                scores[level] += score
+                num_questions["all"] += 1
+                num_questions[level] += 1
     # Save scored file
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
         path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=BALM_TOKEN
     )
     # Actual submission
     eval_entry = {
         "model": model,
         "organisation": organisation,
         "mail": mail,
+        "score": scores["all"]/num_questions["all"],
+        "score_level1": scores[1]/num_questions[1],
+        "score_level2": scores[2]/num_questions[2],
+        "score_level3": scores[3]/num_questions[3],
     }
+    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    print(eval_results)
+    eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=BALM_TOKEN)
     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")
 def refresh():
+    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN, download_mode="force_redownload")
+    eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
+    eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+    return eval_dataframe_val, eval_dataframe_test
 def upload_file(files):
     file_paths = [file.name for file in files]
                 elem_id="citation-button",
             ).style(show_copy_button=True)
+    with gr.Tab("Results: Validation"):
+        leaderboard_table_val = gr.components.Dataframe(
+            value=eval_dataframe_val, headers=COLS, datatype=TYPES, interactive=False,
         )
+    with gr.Tab("Results: Test"):
+        leaderboard_table_test = gr.components.Dataframe(
+            value=eval_dataframe_test, headers=COLS, datatype=TYPES, interactive=False,
         )
     refresh_button = gr.Button("Refresh")
         refresh,
         inputs=[],
         outputs=[
+            leaderboard_table_val,
+            leaderboard_table_test,
         ],
     )
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             with gr.Column():
+                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                 model_name_textbox = gr.Textbox(label="Model name")
                 file_output = gr.File()
             with gr.Column():

requirements.txt CHANGED Viewed

@@ -18,7 +18,7 @@ filelock==3.11.0
 fonttools==4.39.3
 frozenlist==1.3.3
 fsspec==2023.4.0
-datasets
 gradio==3.27.0
 gradio_client==0.1.3
 h11==0.14.0

 fonttools==4.39.3
 frozenlist==1.3.3
 fsspec==2023.4.0
+datasets==2.14.5
 gradio==3.27.0
 gradio_client==0.1.3
 h11==0.14.0