MODEL_INFO = [
"Model Name (clickable)",
"TheoremQA",
"MATH",
"GSM",
]
MODEL_INFO_TAB_QUALITY = [
"Model Name (clickable)",
"Quality Score",
"Selected Score"
]
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number']
CSV_DIR = "./leaderboard/results.csv"
COLUMN_NAMES = MODEL_INFO
LEADERBORAD_INTRODUCTION = """# TheoremQA Leaderboard
*"Which Model is better on STEM QA?"*
πŸ† Welcome to the leaderboard of the **TheoremQA**! 🎦 *A Theorem-driven Question Answering dataset* (**EMNLP 2023**)
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2305.12524'><img src='https://img.shields.io/badge/cs.CL-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://github.com/TIGER-AI-Lab/TheoremQA'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
<a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
</div>
We propose the first question-answering dataset driven by STEM theorems. We annotated 800 QA pairs covering 350+ theorems spanning Math, EE&CS, Physics, and Finance. The dataset was collected by human experts and is of very high quality. We provide it as a new benchmark to test how well large language models can apply theorems to solve challenging university-level questions.
Please follow the instructions in the [TheoremQA](https://github.com/TIGER-AI-Lab/TheoremQA) repository to use the dataset.
"""
TABLE_INTRODUCTION = """
"""
LEADERBORAD_INFO = """
TheoremQA is the first question-answering benchmark driven by STEM theorems. It consists of 800 QA pairs, annotated by human experts, covering 350+ theorems across Math, EE&CS, Physics, and Finance. Each question requires applying the relevant theorem to solve a challenging university-level problem, so the benchmark directly tests whether large language models can ground their reasoning in established STEM results. The leaderboard reports model results on TheoremQA alongside the MATH and GSM benchmarks.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@inproceedings{chen2023theoremqa,
title={{TheoremQA}: A Theorem-driven Question Answering Dataset},
author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
year={2023}
}"""