Terry Zhuo committed
Commit 6c29798 · 1 parent: ae7a86d
update

Changed files:
- app.py (+19, -11)
- src/text_content.py (+11, -1)
app.py
CHANGED
@@ -9,7 +9,7 @@ import requests
 from huggingface_hub import HfApi
 
 from src.css_html import custom_css
-from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
+from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
 from src.utils import (
     AutoEvalColumn,
     fields,
@@ -23,11 +23,11 @@ from src.utils import (
 from datasets import load_dataset
 TOKEN = os.environ.get("TOKEN", None)
 api = HfApi(TOKEN)
-df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("…
-task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="…
-…
-complete_solve_rate = load_dataset("bigcode/bigcodebench-…
-instruct_solve_rate = load_dataset("bigcode/bigcodebench-…
+df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values("elo_mle", ascending=False)
+task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="task_no_tie").to_pandas()
+bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()
+complete_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="complete").to_pandas()
+instruct_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="instruct").to_pandas()
 
 QUEUE_REPO = "bigcode/bigcodebench-requests"
 EVAL_REQUESTS_PATH = "eval-queue"
@@ -248,10 +248,9 @@ with demo:
 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
 - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
 - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
-- `size` is the amount of activated model weight during inference.
+- `size` (optional) is the amount of activated model weight during inference.
 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
 - For more details check the 📝 About section.
-- Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
 """,
 elem_classes="markdown-text",
 )
@@ -265,7 +264,7 @@ with demo:
 with gr.Group():
     gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
     model_elo_map = gr.Plot()
-    demo.load(plot_elo_mle, [gr.Dataframe(…
+    demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)
 
 with gr.TabItem("🧩 Solve Rate", id=2):
     with gr.Column():
@@ -280,8 +279,17 @@ with demo:
 
 with gr.TabItem("📝 About", id=3):
     gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
-with gr.TabItem("Submit Results 🚀", id=4):
+with gr.TabItem("Submit/Request Results 🚀", id=4):
     gr.Markdown(SUBMISSION_TEXT_3)
-
+
+with gr.Row():
+    with gr.Accordion("📙 Citation", open=False):
+        citation_button = gr.Textbox(
+            value=CITATION_BUTTON_TEXT,
+            label=CITATION_BUTTON_LABEL,
+            lines=20,
+            elem_id="citation-button",
+            show_copy_button=True,
+        )
 
 demo.launch()
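For orientation, here is a minimal sketch of the pattern the added app.py lines wire up: the benchmark-level Elo split is loaded into a pandas DataFrame and passed to the page-load event through a hidden `gr.Dataframe`, which renders into the `model_elo_map` plot. `plot_elo_mle` is not shown in this commit, so the stand-in below and its `model`/`rating` column names are assumptions, not the leaderboard's actual code.

```python
# Sketch only: mirrors the load_dataset / demo.load wiring added above.
# The real plot_elo_mle is defined elsewhere in the repo; this stand-in and the
# "model"/"rating" column names are hypothetical.
import gradio as gr
import plotly.express as px
from datasets import load_dataset

bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()

def plot_elo_mle(df):
    # Hypothetical plotting callback: one bar per model.
    return px.bar(df, x="model", y="rating", title="BigCodeBench-Complete Elo (MLE)")

with gr.Blocks() as demo:
    model_elo_map = gr.Plot()
    # A hidden Dataframe component carries the data into the load-event callback.
    demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)

demo.launch()
```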
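The leaderboard text above describes `elo_mle` as a task-level Bootstrap of Maximum Likelihood Elo rating that starts from 1000 and is bootstrapped 500 times. This commit only consumes precomputed ratings from `bigcode/bigcodebench-elo`, but the usual form of such an estimator (a Bradley-Terry / logistic-regression fit resampled over pairwise battles, as popularized by Chatbot Arena-style rankings) looks roughly like the sketch below. The `model_a`/`model_b`/`winner` schema is an assumption, ties are ignored, and this is not BigCodeBench's actual implementation.

```python
# Illustrative sketch of a bootstrapped maximum-likelihood (Bradley-Terry) Elo,
# not the BigCodeBench implementation. Assumes a battles DataFrame with
# hypothetical columns: model_a, model_b, winner ("model_a" or "model_b").
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def mle_elo(battles, scale=400, base=10, init_rating=1000):
    models = pd.unique(battles[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    X = np.zeros((len(battles), len(models)))
    rows = np.arange(len(battles))
    X[rows, battles["model_a"].map(idx).to_numpy()] = +np.log(base)
    X[rows, battles["model_b"].map(idx).to_numpy()] = -np.log(base)
    y = (battles["winner"] == "model_a").to_numpy().astype(int)
    # Effectively unregularized logistic regression over model "strength" features.
    lr = LogisticRegression(fit_intercept=False, C=1e6)
    lr.fit(X, y)
    return pd.Series(scale * lr.coef_[0] + init_rating, index=models)

def bootstrap_mle_elo(battles, n_rounds=500, seed=0):
    rng = np.random.default_rng(seed)
    samples = [
        mle_elo(battles.sample(frac=1.0, replace=True, random_state=int(rng.integers(2**31 - 1))))
        for _ in range(n_rounds)
    ]
    # Median rating per model across the bootstrap rounds.
    return pd.DataFrame(samples).median().sort_values(ascending=False)
```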
src/text_content.py
CHANGED
@@ -122,6 +122,16 @@ To submit your results create a **Pull Request** in the community tab to add the
 The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
 """
 
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
+CITATION_BUTTON_TEXT = r"""
+@article{bigcodebench,
+  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
+  author={Zhuo, Terry Yue and Vu, Min Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and Brunner, Simon and Gong, Chen and Hoang, Thong and Zebaze, Armel Randy and Hong, Xiaoheng and Li, Wen-Ding and Kaddour, Jean and Xu, Ming and Zhang, Zhihan and Yadav, Prateek and Jain, Naman and Gu, Alex and Cheng, Zhoujun and Liu, Jiawei and Liu, Qian and Wang, Zijian and Lo, David and Hui, Binyuan and Muennighoff, Niklas and Fried, Daniel and Du, Xiaoning and de Vries, Harm and Von Werra, Leandro},
+  year={2024}
+}
+"""
+
 SUBMISSION_TEXT_3="""
-
+We welcome the community to request for new models to be added to the leaderboard. Please [submit a PR here](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard 🤗
 """