de-arena / app.py
yzabc007's picture
update
785b751
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
SUB_TITLE,
EXTERNAL_LINKS,
COMING_SOON_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df
from src.submission.submit import add_new_eval
def restart_space():
    """Restart this Space via ``API.restart_space`` using ``REPO_ID``.

    Called as the fallback when a start-up dataset download raises, and
    registered as the periodic job on the background scheduler.
    """
    API.restart_space(repo_id=REPO_ID)
### Space initialisation
# Download the request-queue dataset and then the results dataset into their
# local directories. Each download is attempted independently; if one raises,
# restart_space() is called and the script carries on with the next step.
for dataset_repo, local_path in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
):
    try:
        print(local_path)
        snapshot_download(
            repo_id=dataset_repo,
            local_dir=local_path,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()

# Tables built from the freshly downloaded data.
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
    EVAL_REQUESTS_PATH, EVAL_COLS
)
def init_leaderboard(dataframe):
    """Build a non-interactive ``Leaderboard`` component for ``dataframe``.

    Column selection and column filters are disabled; the only control is a
    search box on the model-name column.

    Args:
        dataframe: Table to display. Must not be ``None`` and must not be empty.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or ``dataframe.empty`` is true.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Display types come from every declared AutoEvalColumn field.
    column_types = [column.type for column in fields(AutoEvalColumn)]
    model_search = SearchColumns(
        primary_column=AutoEvalColumn.model.name,
        secondary_columns=[],
        placeholder="Search by the model name",
        label="Searching",
    )
    # Columns flagged as hidden are never shown.
    hidden_columns = [column.name for column in fields(AutoEvalColumn) if column.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=None,
        search_columns=model_search,
        hide_columns=hidden_columns,
        filter_columns=None,
        interactive=False,
    )
# model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
# model_result_path = "./src/results/models_2024-10-24-08:08:59.127307.json"
# Results file passed to every get_model_leaderboard_df() call in the UI below;
# the commented-out paths above are earlier result files.
model_result_path = "./src/results/models_2024-11-08-08:36:00.464224.json"
# model_leaderboard_df = get_model_leaderboard_df(model_result_path)
def overall_leaderboard(dataframe):
    """Wrap ``dataframe`` in a read-only ``Leaderboard`` with model-name search.

    Args:
        dataframe: Table to render. ``None`` or an empty frame is rejected.

    Returns:
        A ``Leaderboard`` component with no column selector and no filters.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    unusable = dataframe is None or dataframe.empty
    if unusable:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    settings = {
        "value": dataframe,
        "datatype": [col.type for col in fields(AutoEvalColumn)],
        "select_columns": None,
        "search_columns": SearchColumns(
            primary_column=AutoEvalColumn.model.name,
            secondary_columns=[],
            placeholder="Search by the model name",
            label="Searching",
        ),
        "hide_columns": [col.name for col in fields(AutoEvalColumn) if col.hidden],
        "filter_columns": None,
        "interactive": False,
    }
    return Leaderboard(**settings)
# Your leaderboard name
# NOTE: TITLE, SUB_TITLE, EXTERNAL_LINKS and INTRODUCTION_TEXT are also imported
# from src.about at the top of this file; the assignments below rebind those
# names, so these values are the ones in effect from here on.
TITLE = """<h1 align="center" id="space-title">🏅 Decentralized Arena Leaderboard</h1>"""

SUB_TITLE = """<h2 align="center" id="space-subtitle">Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h2>"""

# <a href="https://github.com/maitrix-org/de-arena" target="_blank">GitHub</a> |
EXTERNAL_LINKS = """
<h2 align="center" id="space-links">
<a href="https://de-arena.maitrix.org/" target="_blank">Blog</a> |
<a href="https://de-arena.maitrix.org/images/Heading.mp4" target="">Video</a> |
<a href="https://maitrix.org/" target="_blank">@Maitrix.org</a> |
<a href="https://www.llm360.ai/" target="_blank">@LLM360</a>
</h2>
"""

# What does your leaderboard evaluate?
# The backslash before "%" is written as "\\" so the literal no longer contains
# the invalid escape sequence "\%" (a SyntaxWarning on recent Python versions).
# The resulting string value is unchanged.
INTRODUCTION_TEXT = """
**Decentralized Arena** automates and scales "Chatbot Arena" for LLM evaluation across various fine-grained dimensions
(e.g., math – algebra, geometry, probability; logical reasoning, social reasoning, biology, chemistry, …).
The evaluation is decentralized and democratic, with all LLMs participating in evaluating others.
It achieves a 95\\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible.
"""
demo = gr.Blocks(css=custom_css)
with demo:
# Page header: title, subtitle and external links, each rendered as raw HTML.
for header_html in (TITLE, SUB_TITLE, EXTERNAL_LINKS):
    gr.HTML(header_html)

# The introduction is emitted as HTML (not Markdown) so its font size can be
# set inline; the same size constant is reused by the tab descriptions.
INTRODUCTION_TEXT_FONT_SIZE = 16
paragraph_open = f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
INTRODUCTION_TEXT = "".join(
    [
        # Paragraph 1: what the project is.
        paragraph_open,
        '<strong>Decentralized Arena</strong> automates, scales, and accelerates <a href="https://lmarena.ai/">Chatbot Arena</a> ',
        'for large language model (LLM) evaluation across diverse, fine-grained dimensions, ',
        'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, science (chemistry, physics, biology), or any user-defined dimensions. ',
        'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. ',
        "With a 95% correlation to Chatbot Arena's overall rankings, the system is fully transparent and reproducible.",
        '</p>',
        # Paragraph 2: invitation to participate.
        paragraph_open,
        'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts ',
        'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.',
        '</p>',
    ]
)
gr.HTML(INTRODUCTION_TEXT)
'''
TEXT = (
f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
''
'</p>'
)
gr.HTML(TEXT)
'''
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
    # Banner shown above the overview tables: model count / update date, then a
    # short description. (Typos in the user-facing text have been corrected and
    # the missing space between the two sentences restored.)
    TEXT = (
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        '<b>Total #models: 62 (Last updated: 2024-11-08)</b>'
        '</p>'
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        'This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks or scores. '
        '(Missing values are due to the slow or problematic model responses to be fixed soon.)'
        '</p>'
    )
    gr.HTML(TEXT)

    # NOTE(review): both sub-tabs pass the indices 1 and 8 in rank_col while
    # nine metric columns follow the model column. The per-domain overview tabs
    # elsewhere in this file pass (1, 1 + number of metric columns), so these
    # indices may be stale -- confirm against get_model_leaderboard_df whether
    # the last columns are meant to be left out of the average.
    with gr.TabItem("⭐ Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"):
        leaderboard = overall_leaderboard(
            get_model_leaderboard_df(
                model_result_path,
                benchmark_cols=[
                    AutoEvalColumn.model.name,
                    AutoEvalColumn.rank_math_algebra.name,
                    AutoEvalColumn.rank_math_geometry.name,
                    AutoEvalColumn.rank_math_probability.name,
                    AutoEvalColumn.rank_reason_logical.name,
                    AutoEvalColumn.rank_reason_social.name,
                    AutoEvalColumn.rank_chemistry.name,
                    AutoEvalColumn.rank_biology.name,
                    AutoEvalColumn.rank_physics.name,
                    AutoEvalColumn.rank_overall.name,
                ],
                rank_col=['sort_by_rank', 1, 8],
            )
        )

    with gr.TabItem("⭐ Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"):
        leaderboard = overall_leaderboard(
            get_model_leaderboard_df(
                model_result_path,
                benchmark_cols=[
                    AutoEvalColumn.model.name,
                    AutoEvalColumn.score_math_algebra.name,
                    AutoEvalColumn.score_math_geometry.name,
                    AutoEvalColumn.score_math_probability.name,
                    AutoEvalColumn.score_reason_logical.name,
                    AutoEvalColumn.score_reason_social.name,
                    AutoEvalColumn.score_chemistry.name,
                    AutoEvalColumn.score_biology.name,
                    AutoEvalColumn.score_physics.name,
                    AutoEvalColumn.score_overall.name,
                ],
                rank_col=['sort_by_score', 1, 8],
            )
        )
with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
    # Description shown above the math tables. Fixes relative to the previous
    # text: the MATH-500 link had a space inside its scheme ("htt ps://"), a
    # stray "</b>" with no opening tag followed "include", and "contamination"
    # was misspelled.
    TEXT = (
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        'Algebra, Geometry, and Probability are the current three main math domains in the leaderboard. '
        'To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources. '
        'We prioritize <b>recent math datasets</b> and focus on <b>college and beyond level</b> math questions. '
        'The current datasets include '
        '<a href="https://arxiv.org/abs/2103.03874">MATH</a>, '
        '<a href="https://github.com/openai/prm800k/tree/main/prm800k/math_splits">MATH-500</a>, '
        '<a href="https://omni-math.github.io/">Omni</a>, '
        '<a href="https://arxiv.org/abs/1905.13319">MathQA</a>, '
        '<a href="https://arxiv.org/abs/2405.12209">MathBench</a>, '
        '<a href="https://arxiv.org/abs/2307.10635">SciBench</a>, and more! '
        '</p>'
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        'We plan to include more math domains, such as calculus, number theory, and more in the future. '
        '</p>'
    )
    gr.HTML(TEXT)

    # Cross-domain overview: one table sorted by rank, one sorted by score.
    with gr.TabItem("🏆 Overview", elem_id="math_overview_subtab", id=0, elem_classes="subtab"):
        with gr.TabItem("⭐ Sort by Rank", elem_id="math_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.rank_math_algebra.name,
                        AutoEvalColumn.rank_math_geometry.name,
                        AutoEvalColumn.rank_math_probability.name,
                    ],
                    rank_col=['sort_by_rank', 1, 4, 'Math'],
                )
            )

        with gr.TabItem("⭐ Sort by Score", elem_id="math_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.score_math_algebra.name,
                        AutoEvalColumn.score_math_geometry.name,
                        AutoEvalColumn.score_math_probability.name,
                    ],
                    rank_col=['sort_by_score', 1, 4, 'Math'],
                )
            )

    # One detail sub-tab per math domain. All three share the same column
    # layout (rank, model, score, license, organization, knowledge cutoff) and
    # are ranked by the domain's own rank column, so they are built from a table.
    for tab_label, tab_elem_id, tab_id, rank_column, score_column in (
        ("🧮 Algebra", "algebra_subtab", 1, AutoEvalColumn.rank_math_algebra, AutoEvalColumn.score_math_algebra),
        ("📐 Geometry", "geometry_subtab", 2, AutoEvalColumn.rank_math_geometry, AutoEvalColumn.score_math_geometry),
        ("📊 Probability", "prob_subtab", 3, AutoEvalColumn.rank_math_probability, AutoEvalColumn.score_math_probability),
    ):
        with gr.TabItem(tab_label, elem_id=tab_elem_id, id=tab_id, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        rank_column.name,
                        AutoEvalColumn.model.name,
                        score_column.name,
                        AutoEvalColumn.license.name,
                        AutoEvalColumn.organization.name,
                        AutoEvalColumn.knowledge_cutoff.name,
                    ],
                    rank_col=[rank_column.name],
                )
            )
# The elem_id spelling "reasonong-tab-table" is kept exactly as it was, since
# the id may be referenced from outside this file (e.g. custom CSS) -- verify
# before renaming.
with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
    # Markdown description. Fixes relative to the previous text: the logical
    # dataset list now ends with a period instead of a dangling comma, and
    # "reasoning" was misspelled in "first-order logic reasoning".
    DESCRIPTION_TEXT = """
Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
We now present two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
For logical reasoning, we leverage datasets from sources such as
[BIG-Bench Hard (BBH)](https://arxiv.org/abs/2210.09261),
[FOLIO](https://arxiv.org/abs/2209.00840),
[LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
[PrOntoQA](https://arxiv.org/abs/2210.01240),
[ReClor](https://arxiv.org/abs/2002.04326).
These cover a range of tasks including deductive reasoning, object counting and tracking, pattern recognition,
temporal reasoning, first-order logic reasoning, etc.
For social reasoning, we collect datasets from
[MMToM-QA (Text-only)](https://arxiv.org/abs/2401.08743),
[BigToM](https://arxiv.org/abs/2306.15448),
[Adv-CSFB](https://arxiv.org/abs/2305.14763),
[SocialIQA](https://arxiv.org/abs/1904.09728),
[NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future.
"""
    gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

    # Cross-domain overview: one table sorted by rank, one sorted by score.
    with gr.TabItem("🏆 Overview", elem_id="reasoning_overview_subtab", id=0, elem_classes="subtab"):
        with gr.TabItem("⭐ Sort by Rank", elem_id="reasoning_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.rank_reason_logical.name,
                        AutoEvalColumn.rank_reason_social.name,
                    ],
                    rank_col=['sort_by_rank', 1, 3, 'Reasoning'],
                )
            )

        with gr.TabItem("⭐ Sort by Score", elem_id="reasoning_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.score_reason_logical.name,
                        AutoEvalColumn.score_reason_social.name,
                    ],
                    rank_col=['sort_by_score', 1, 3, 'Reasoning'],
                )
            )

    # One detail sub-tab per reasoning type; both share the same column layout
    # and are ranked by their own rank column.
    for tab_label, tab_elem_id, tab_id, rank_column, score_column in (
        ("🧩 Logical", "logical_subtab", 1, AutoEvalColumn.rank_reason_logical, AutoEvalColumn.score_reason_logical),
        ("🗣️ Social", "social_subtab", 2, AutoEvalColumn.rank_reason_social, AutoEvalColumn.score_reason_social),
    ):
        with gr.TabItem(tab_label, elem_id=tab_elem_id, id=tab_id, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        rank_column.name,
                        AutoEvalColumn.model.name,
                        score_column.name,
                        AutoEvalColumn.license.name,
                        AutoEvalColumn.organization.name,
                        AutoEvalColumn.knowledge_cutoff.name,
                    ],
                    rank_col=[rank_column.name],
                )
            )
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
    # Markdown description of the science dimension and its source datasets.
    CURRENT_TEXT = """
Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
We have diversely and aggressively collected recent scientific datasets, including but not limited to
[GPQA](https://arxiv.org/abs/2311.12022),
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
[MMLU-Pro](https://arxiv.org/abs/2406.01574),
[OlympiadBench](https://arxiv.org/abs/2402.14008),
[SciBench](https://arxiv.org/abs/2307.10635),
[SciEval](https://arxiv.org/abs/2308.13149).
"""
    gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

    # Cross-domain overview: one table sorted by rank, one sorted by score.
    with gr.TabItem("🏆 Overview", elem_id="science_overview_subtab", id=0, elem_classes="subtab"):
        with gr.TabItem("⭐ Sort by Rank", elem_id="science_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
            science_rank_columns = [
                AutoEvalColumn.model.name,
                AutoEvalColumn.rank_chemistry.name,
                AutoEvalColumn.rank_biology.name,
                AutoEvalColumn.rank_physics.name,
            ]
            science_rank_df = get_model_leaderboard_df(
                model_result_path,
                benchmark_cols=science_rank_columns,
                rank_col=['sort_by_rank', 1, 4, 'Science'],
            )
            leaderboard = overall_leaderboard(science_rank_df)

        with gr.TabItem("⭐ Sort by Score", elem_id="science_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
            science_score_columns = [
                AutoEvalColumn.model.name,
                AutoEvalColumn.score_chemistry.name,
                AutoEvalColumn.score_biology.name,
                AutoEvalColumn.score_physics.name,
            ]
            science_score_df = get_model_leaderboard_df(
                model_result_path,
                benchmark_cols=science_score_columns,
                # the two numbers are indices selecting the columns to average and sort
                rank_col=['sort_by_score', 1, 4, 'Science'],
            )
            leaderboard = overall_leaderboard(science_score_df)

    # One detail sub-tab per science domain, created in this order. The tab ids
    # are reproduced as they were (Biology is 3, Physics is 2).
    for tab_label, tab_elem_id, tab_id, rank_column, score_column in (
        ("🧪 Chemistry", "chemistry_subtab", 1, AutoEvalColumn.rank_chemistry, AutoEvalColumn.score_chemistry),
        ("🧬 Biology", "biology_subtab", 3, AutoEvalColumn.rank_biology, AutoEvalColumn.score_biology),
        ("⚛️ Physics", "physics_subtab", 2, AutoEvalColumn.rank_physics, AutoEvalColumn.score_physics),
    ):
        with gr.TabItem(tab_label, elem_id=tab_elem_id, id=tab_id, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        rank_column.name,
                        AutoEvalColumn.model.name,
                        score_column.name,
                        AutoEvalColumn.license.name,
                        AutoEvalColumn.organization.name,
                        AutoEvalColumn.knowledge_cutoff.name,
                    ],
                    rank_col=[rank_column.name],
                )
            )
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
    # Markdown description of the coding dimension and its source datasets.
    CURRENT_TEXT = """
We are working on adding more fine-grained tasks in coding domains to the leaderboard.
The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
We collect a variety of recent coding datasets, including
[HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
[MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
[HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
[newly crawled LeetCode data](https://leetcode.com/problemset/),
filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto) and more!
Our efforts also include synthesizing new code-related queries to ensure diversity!
"""
    gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

    # C++ is the only language with a results table in this tab.
    with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
        cpp_columns = [
            AutoEvalColumn.rank_cpp.name,
            AutoEvalColumn.model.name,
            AutoEvalColumn.score_cpp.name,
            AutoEvalColumn.license.name,
            AutoEvalColumn.organization.name,
            AutoEvalColumn.knowledge_cutoff.name,
        ]
        cpp_df = get_model_leaderboard_df(
            model_result_path,
            benchmark_cols=cpp_columns,
            rank_col=[AutoEvalColumn.rank_cpp.name],
        )
        leaderboard = overall_leaderboard(cpp_df)

    # Python and Java only show a placeholder heading.
    CURRENT_TEXT = "\n# Coming soon!\n"
    for tab_label, tab_elem_id, tab_id in (
        ("🐍 Python", "python_subtab", 1),
        ("☕ Java", "java_subtab", 2),
    ):
        with gr.TabItem(tab_label, elem_id=tab_elem_id, id=tab_id, elem_classes="subtab"):
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
    # Markdown description of the overall (MT-Bench) dimension. The typo
    # "coving" in the user-facing text has been corrected to "covering".
    DESCRIPTION_TEXT = """
Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
"""
    gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

    # Single table ranked by the overall rank column.
    with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
        leaderboard = overall_leaderboard(
            get_model_leaderboard_df(
                model_result_path,
                benchmark_cols=[
                    AutoEvalColumn.rank_overall.name,
                    AutoEvalColumn.model.name,
                    AutoEvalColumn.score_overall.name,
                    AutoEvalColumn.license.name,
                    AutoEvalColumn.organization.name,
                    AutoEvalColumn.knowledge_cutoff.name,
                ],
                rank_col=[AutoEvalColumn.rank_overall.name],
            )
        )
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
    # Static Markdown: project blurb, team and contact details. The typo
    # "reseachers" in the user-facing text has been corrected.
    # NOTE(review): the three mailto: targets read "[email protected]", which
    # looks like an address-obfuscation placeholder rather than real addresses;
    # confirm and restore the intended e-mail addresses.
    ABOUT_TEXT = """
# About Us
[Decentralized Arena](https://de-arena.maitrix.org/) is an open-source project that automates and scales the evaluation of large language models (LLMs) across various fine-grained dimensions,
developed by researchers from UCSD, CMU, MBZUAI, [Maitrix.org](https://maitrix.org/) and [LLM360](https://www.llm360.ai/).
Stay tuned for more updates and new features!
## Team members
Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
[Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/),
[Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
[Zhiting Hu](https://zhiting.ucsd.edu/)
## Contact Us
- Follow us on X, [Maitrix.org](https://twitter.com/MaitrixOrg) and [LLM360](https://twitter.com/llm360)
- Email us at [Zhen Wang](mailto:[email protected]), [Kun Zhou](mailto:[email protected]) and [Zhiting Hu](mailto:[email protected])
"""
    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
'''
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Row():
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
submission_result,
)
'''
# Collapsed-by-default accordion holding the citation text in a copyable box.
with gr.Row(), gr.Accordion("📙 Citation", open=False):
    citation_button = gr.Textbox(
        value=CITATION_BUTTON_TEXT,
        label=CITATION_BUTTON_LABEL,
        lines=20,
        elem_id="citation-button",
        show_copy_button=True,
    )
# Schedule restart_space() to run every 30 minutes, then start serving the UI.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=30 * 60)
scheduler.start()

queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()