import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

MODEL_INFO = [
    "Model (CoT)",
    "Avg",
    "TheoremQA",
    "MATH",
    "GSM",
]
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number']

SUBMISSION_NAME = "science_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
CSV_DIR = "./science_leaderboard_submission/results.csv"

COLUMN_NAMES = MODEL_INFO
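# A note on the expected results.csv layout (an assumption inferred from the
# code below, not documented in this file): one row per model, with no "Avg"
# column -- get_df() computes the average on the fly. The model column is
# rendered as markdown (see DATA_TITILE_TYPE), so links work. A hypothetical
# row, reusing the sample scores from SUBMIT_INTRODUCTION:
#
#   Model (CoT),TheoremQA,MATH,GSM
#   [Model X](https://example.com),0.5,0.5,0.5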
LEADERBORAD_INTRODUCTION = """# TheoremQA Leaderboard

*"Which Model is better on STEM QA?"*

🏆 Welcome to the leaderboard of **TheoremQA**! 🎦 *A Theorem-driven Question Answering dataset* (**EMNLP 2023**)

We propose the first question-answering dataset driven by STEM theorems. We annotated 800 QA pairs covering 350+ theorems spanning Math, EE&CS, Physics, and Finance. The dataset was curated by human experts to ensure high quality, and serves as a new benchmark for testing the limits of large language models in applying theorems to solve challenging university-level questions. Please follow the instructions in [TheoremQA](https://github.com/TIGER-AI-Lab/TheoremQA) to evaluate your model.
"""

TABLE_INTRODUCTION = """ """

LEADERBORAD_INFO = """
TheoremQA is the first theorem-driven question-answering benchmark for STEM. It contains 800 QA pairs
annotated by human experts, covering 350+ theorems spanning Math, EE&CS, Physics, and Finance. Answering
each question requires identifying and applying the relevant theorem, so the benchmark tests the limits
of large language models on challenging university-level problems. This leaderboard reports
chain-of-thought accuracy on TheoremQA alongside MATH and GSM, with the mean of the three shown in the
Avg column.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@inproceedings{chen2023theoremqa,
  title={TheoremQA: A Theorem-driven Question Answering Dataset},
  author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
  booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
  year={2023}
}"""

SUBMIT_INTRODUCTION = """# Submit to the TheoremQA Leaderboard

## ⚠ Please note that you need to submit a JSON file in the following format:
```json
{
    "Model Name": "Model X",
    "TheoremQA": 0.5,
    "MATH": 0.5,
    "GSM": 0.5
}
```
After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
"""


def get_df():
    # Clone/pull the submission dataset repo so we always read the latest results.
    repo = Repository(local_dir=SUBMISSION_NAME,
                      clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN,
                      repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # "Avg" is not stored in the CSV; compute it from the three benchmark scores.
    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM']].mean(axis=1).round(1)
    return df[COLUMN_NAMES]


def add_new_eval(
    input_file,
):
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    # Keys must match the submission format documented in SUBMIT_INTRODUCTION.
    data_row = [upload_data['Model Name'], upload_data['TheoremQA'],
                upload_data['MATH'], upload_data['GSM']]

    submission_repo = Repository(local_dir=SUBMISSION_NAME,
                                 clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN,
                                 repo_type="dataset")
    submission_repo.git_pull()

    # Append the new entry to the shared CSV and push it back to the Hub.
    with open(CSV_DIR, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data_row)
    submission_repo.push_to_hub()
    return "Success! Click the \"Refresh\" button to see the updated leaderboard."


def refresh_data():
    return get_df()
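# A minimal sketch of how these constants and callbacks might be wired into a
# Gradio Blocks app. The actual demo layout is not part of this file, so the
# arrangement below (tab names, button labels, and gr.File(type="binary") so
# that add_new_eval receives the raw JSON bytes) is an assumption, not the
# project's real UI code.
def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown(LEADERBORAD_INTRODUCTION)
        with gr.Tab("Leaderboard"):
            gr.Markdown(TABLE_INTRODUCTION)
            data_component = gr.Dataframe(
                value=get_df(),
                headers=COLUMN_NAMES,
                datatype=DATA_TITILE_TYPE,
                interactive=False,
            )
            refresh_button = gr.Button("Refresh")
            # Re-pull the submission repo and redraw the table.
            refresh_button.click(refresh_data, outputs=data_component)
        with gr.Tab("Submit"):
            gr.Markdown(SUBMIT_INTRODUCTION)
            input_file = gr.File(label="Upload your JSON result", type="binary")
            submit_button = gr.Button("Submit Eval")
            submit_status = gr.Markdown()
            submit_button.click(add_new_eval, inputs=input_file, outputs=submit_status)
        with gr.Accordion("Citation", open=False):
            gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=7)
    return demo


if __name__ == "__main__":
    build_demo().launch()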