import csv
import json
import os

import gradio as gr
import pandas as pd
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics",
            "Engineering", "Health", "History", "Law", "Math", "Philosophy",
            "Physics", "Psychology", "Other"]

# Leaderboard columns: model name, overall accuracy, then one column per subject.
MODEL_INFO = ["Models", "Overall"] + SUBJECTS

# Column types for the Gradio Dataframe: markdown for the model name, numbers elsewhere.
DATA_TITLE_TYPE = ["markdown"] + ["number"] * (len(MODEL_INFO) - 1)

SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
SUBMISSION_URL = "https://huggingface.co/datasets/TIGER-Lab/" + SUBMISSION_NAME
# Path to the results file inside the cloned submission repo.
CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"

COLUMN_NAMES = MODEL_INFO

LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard

MMLU-Pro is a more robust and challenging massive multi-task understanding dataset,
tailored to more rigorously benchmark large language models' capabilities. It contains
12K complex questions across various disciplines. The following are the accuracies of
various models evaluated on MMLU-Pro.

Our dataset is available at [https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro).

If you want to reproduce our results or evaluate your own models on MMLU-Pro, please check out our evaluation scripts at [https://github.com/TIGER-AI-Lab/MMLU-Pro](https://github.com/TIGER-AI-Lab/MMLU-Pro).
"""

TABLE_INTRODUCTION = """ """

LEADERBOARD_INFO = """
We list the information of the used datasets as follows:
""" CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""""" SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction ## ⚠ Please note that you need to submit the json file with following format: ```json { "Model": "[MODEL_NAME]", "Overall": 0.5678, "Biology": 0.1234, "Business": 0.4567, ..., "Other: 0.3456" } ``` After submitting, you can click the "Refresh" button to see the updated leaderboard (it may takes few seconds). """ def get_df(): repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN) repo.git_pull() df = pd.read_csv(CSV_DIR) df = df.sort_values(by=['Overall'], ascending=False) return df[COLUMN_NAMES] def add_new_eval( input_file, ): if input_file is None: return "Error! Empty file!" upload_data = json.loads(input_file) print("upload_data:\n", upload_data) data_row = [f'{upload_data["Model"]}', upload_data['Overall']] for subject in SUBJECTS: data_row += [upload_data[subject]] print("data_row:\n", data_row) submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset") submission_repo.git_pull() already_submitted = [] with open(CSV_DIR, mode='r') as file: reader = csv.reader(file, delimiter=',') for row in reader: already_submitted.append(row[0]) if data_row[0] not in already_submitted: with open(CSV_DIR, mode='a', newline='') as file: writer = csv.writer(file) writer.writerow(data_row) submission_repo.push_to_hub() print('Submission Successful') else: print('The entry already exists') def refresh_data(): return get_df()