Terry Zhuo committed
Commit 6c29798 · 1 parent: ae7a86d
update

Changed files:
- app.py (+19, -11)
- src/text_content.py (+11, -1)
app.py
CHANGED
@@ -9,7 +9,7 @@ import requests
 from huggingface_hub import HfApi
 
 from src.css_html import custom_css
-from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
+from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
 from src.utils import (
     AutoEvalColumn,
     fields,
@@ -23,11 +23,11 @@ from src.utils import (
 from datasets import load_dataset
 TOKEN = os.environ.get("TOKEN", None)
 api = HfApi(TOKEN)
-df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("…
-task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="…
-…
-complete_solve_rate = load_dataset("bigcode/bigcodebench-…
-instruct_solve_rate = load_dataset("bigcode/bigcodebench-…
+df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("elo_mle", ascending=False)
+task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="task_no_tie").to_pandas()
+bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()
+complete_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="complete").to_pandas()
+instruct_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="instruct").to_pandas()
 
 QUEUE_REPO = "bigcode/bigcodebench-requests"
 EVAL_REQUESTS_PATH = "eval-queue"
@@ -248,10 +248,9 @@ with demo:
 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
 - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
 - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
-- `size` is the amount of activated model weight during inference.
+- `size` (optional) is the amount of activated model weight during inference.
 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
 - For more details check the 📝 About section.
-- Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
 """,
 elem_classes="markdown-text",
 )
@@ -265,7 +264,7 @@ with demo:
 with gr.Group():
     gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
     model_elo_map = gr.Plot()
-    demo.load(plot_elo_mle, [gr.Dataframe(…
+    demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)
 
 with gr.TabItem("🧩 Solve Rate", id=2):
     with gr.Column():
@@ -280,8 +279,17 @@ with demo:
 
 with gr.TabItem("📝 About", id=3):
     gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
-with gr.TabItem("Submit Results 🚀", id=4):
+with gr.TabItem("Submit/Request Results 🚀", id=4):
     gr.Markdown(SUBMISSION_TEXT_3)
-
+
+with gr.Row():
+    with gr.Accordion("📙 Citation", open=False):
+        citation_button = gr.Textbox(
+            value=CITATION_BUTTON_TEXT,
+            label=CITATION_BUTTON_LABEL,
+            lines=20,
+            elem_id="citation-button",
+            show_copy_button=True,
+        )
 
 demo.launch()
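For orientation, here is a minimal sketch of the pattern the added app.py lines wire up: the benchmark-level Elo split is loaded into a pandas DataFrame and passed to the page-load event through a hidden `gr.Dataframe`, which renders into the `model_elo_map` plot. `plot_elo_mle` is not shown in this commit, so the stand-in below and its `model`/`rating` column names are assumptions, not the leaderboard's actual code.

```python
# Sketch only: mirrors the load_dataset / demo.load wiring added above.
# The real plot_elo_mle is defined elsewhere in the repo; this stand-in and the
# "model"/"rating" column names are hypothetical.
import gradio as gr
import plotly.express as px
from datasets import load_dataset

bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()

def plot_elo_mle(df):
    # Hypothetical plotting callback: one bar per model.
    return px.bar(df, x="model", y="rating", title="BigCodeBench-Complete Elo (MLE)")

with gr.Blocks() as demo:
    model_elo_map = gr.Plot()
    # A hidden Dataframe component carries the data into the load-event callback.
    demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)

demo.launch()
```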
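The leaderboard text above describes `elo_mle` as a task-level Bootstrap of Maximum Likelihood Elo rating that starts from 1000 and is bootstrapped 500 times. This commit only consumes precomputed ratings from `bigcode/bigcodebench-elo`, but the usual form of such an estimator (a Bradley-Terry / logistic-regression fit resampled over pairwise battles, as popularized by Chatbot Arena-style rankings) looks roughly like the sketch below. The `model_a`/`model_b`/`winner` schema is an assumption, ties are ignored, and this is not BigCodeBench's actual implementation.

```python
# Illustrative sketch of a bootstrapped maximum-likelihood (Bradley-Terry) Elo,
# not the BigCodeBench implementation. Assumes a battles DataFrame with
# hypothetical columns: model_a, model_b, winner ("model_a" or "model_b").
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def mle_elo(battles, scale=400, base=10, init_rating=1000):
    models = pd.unique(battles[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    X = np.zeros((len(battles), len(models)))
    rows = np.arange(len(battles))
    X[rows, battles["model_a"].map(idx).to_numpy()] = +np.log(base)
    X[rows, battles["model_b"].map(idx).to_numpy()] = -np.log(base)
    y = (battles["winner"] == "model_a").to_numpy().astype(int)
    # Effectively unregularized logistic regression over model "strength" features.
    lr = LogisticRegression(fit_intercept=False, C=1e6)
    lr.fit(X, y)
    return pd.Series(scale * lr.coef_[0] + init_rating, index=models)

def bootstrap_mle_elo(battles, n_rounds=500, seed=0):
    rng = np.random.default_rng(seed)
    samples = [
        mle_elo(battles.sample(frac=1.0, replace=True, random_state=int(rng.integers(2**31 - 1))))
        for _ in range(n_rounds)
    ]
    # Median rating per model across the bootstrap rounds.
    return pd.DataFrame(samples).median().sort_values(ascending=False)
```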
src/text_content.py
CHANGED
@@ -122,6 +122,16 @@ To submit your results create a **Pull Request** in the community tab to add the
 The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
 """
 
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
+CITATION_BUTTON_TEXT = r"""
+@article{bigcodebench,
+  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
+  author={Zhuo, Terry Yue and Vu, Min Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and Brunner, Simon and Gong, Chen and Hoang, Thong and Zebaze, Armel Randy and Hong, Xiaoheng and Li, Wen-Ding and Kaddour, Jean and Xu, Ming and Zhang, Zhihan and Yadav, Prateek and Jain, Naman and Gu, Alex and Cheng, Zhoujun and Liu, Jiawei and Liu, Qian and Wang, Zijian and Lo, David and Hui, Binyuan and Muennighoff, Niklas and Fried, Daniel and Du, Xiaoning and de Vries, Harm and Von Werra, Leandro},
+  year={2024}
+}
+"""
+
 SUBMISSION_TEXT_3="""
-
+We welcome the community to request for new models to be added to the leaderboard. Please [submit a PR here](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard 🤗
 """