Patrick Haller committed on
Commit
983ff7e
·
1 Parent(s): 51a9af1

Init leaderboard

Files changed (3)
  1. app.py +73 -0
  2. current_results.json +53 -0
  3. text.py +26 -0
app.py ADDED
@@ -0,0 +1,73 @@
+ import json
+
+ import gradio as gr
+ import pandas as pd
+
+ from text import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, TITLE_TEXT, TASK_DESCRIPTION
+
+ # with open("app.css") as f:
+ #     css_code = f.read()
+
+ demo = gr.Blocks()
+
+ # Load the raw benchmark results shipped with the Space
+ with open("current_results.json") as f:
+     result_list = json.load(f)
+
+ df = pd.DataFrame(result_list)
+
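+ # Render each model name as a clickable link when a model card URL is provided;
+ # rows without a "link" entry (NaN after DataFrame construction) keep the plain name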
+ df["Model"] = df.apply(lambda x: f"<a style='text-decoration: underline' href='{x['link']}'>{x['Model']}</a>" if isinstance(x["link"], str) else x["Model"], axis=1)
+
+ # Order the columns: aoc_original, aoc_leet, euler_original, euler_story
+ df = df[["Model", "instruction_only", "aoc_original", "aoc_leet", "euler_original", "euler_story"]]
+
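+ # Encode instruction_only as 1/0 so the leaderboard can be sorted by evaluation mode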
+ df["instruction_only"] = df["instruction_only"].map({True: 1, False: 0})
+
+ # Rename the columns for display
+ df.columns = ["Model", "Evaluation", "AOC Original",
+               "AOC Leet", "Euler Original", "Euler Story"]
+
+ # Average the four benchmark scores, then sort within each evaluation mode
+ average_scores = df.iloc[:, 2:].mean(axis=1).round(2)
+ df.insert(loc=2, column="⬆️ Average", value=average_scores)
+ df = df.sort_values(by=["Evaluation", "⬆️ Average"], ascending=[True, False])
+ df["Evaluation"] = df["Evaluation"].map({1: "🔶", 0: "🟩"})
+
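+ # Build the page: title, intro, results table, legend, and citation box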
+ with demo:
+     gr.HTML(f"<h2 style='text-align: center'>{TITLE_TEXT}</h2>")
+     # gr.HTML('<hr>')
+     gr.HTML(f"<h3>{INTRODUCTION_TEXT}</h3>")
+     gr.HTML('<hr style="border-top: 3px dotted #bbb" class="dotted">')
+
+     gr.HTML("<h3>📊 Results</h3>")
+     gr.components.Dataframe(
+         value=df,
+         datatype=["html"]
+     )
+
+     gr.HTML("<h3>Legend</h3>")
+     gr.HTML("<p>🔶: Evaluated only on the first part of each AoC day</p>")
+     gr.HTML("<p>🟩: Complete Evaluation</p>")
+
+     # with gr.Row():
+     #     with gr.Accordion("Task", open=True):
+     #         with gr.Row():
+     #             with gr.Column(scale=1):
+     #                 gr.Image("assets/front.png")
+     #             with gr.Column(scale=4):
+     #                 gr.Markdown(TASK_DESCRIPTION)
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ demo.launch()
current_results.json ADDED
@@ -0,0 +1,53 @@
+ [
+     {
+         "Model": "Mistral-7B-Instruct-v0.1",
+         "euler_original": 0.37,
+         "euler_story": 0.12,
+         "aoc_original": 3.0,
+         "aoc_leet": 3.0,
+         "instruction_only": true,
+         "link": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1"
+     },
+     {
+         "Model": "Mixtral-8x7B-Instruct-v0.1",
+         "euler_original": 2.86,
+         "euler_story": 2.23,
+         "aoc_original": 8.67,
+         "aoc_leet": 8.42,
+         "instruction_only": false,
+         "link": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"
+     },
+     {
+         "Model": "chat_bison",
+         "euler_story": 0.62,
+         "euler_original": 2.44,
+         "aoc_leet": 13.78,
+         "aoc_original": 17.09,
+         "instruction_only": false
+     },
+     {
+         "Model": "codechat_bison",
+         "euler_story": 2.61,
+         "euler_original": 4.59,
+         "aoc_original": 21.17,
+         "aoc_leet": 17.6,
+         "instruction_only": false
+     },
+     {
+         "Model": "WizardCoder-Python-34B-V1.0",
+         "aoc_leet": 22.5,
+         "aoc_original": 24.0,
+         "euler_original": 2.61,
+         "euler_story": 2.48,
+         "instruction_only": true,
+         "link": "https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0"
+     },
+     {
+         "Model": "gpt3.5",
+         "euler_original": 8.19,
+         "euler_story": 6.95,
+         "aoc_leet": 29.85,
+         "aoc_original": 50.0,
+         "instruction_only": false
+     }
+ ]
text.py ADDED
@@ -0,0 +1,26 @@
+ TITLE_TEXT = "PECC - Problem Extraction and Coding Challenges Evaluation Benchmark"
+
+ INTRODUCTION_TEXT = """📄 PECC: An extensive benchmark centered on code generation from narrative-embedded problem descriptions. Unlike prior benchmarks that evaluate code generation from explicit instructions, our dataset requires models to comprehend the narrative, extract the requirements, and produce the code that solves the problem. This approach demands not only syntactically correct programs but also the reading comprehension needed to derive the intended solution."""
+
+ TASK_DESCRIPTION = """## Task Description
+ The task for the model is to generate directly executable Python code.
+
+ ### Instruction
+ The model is first given a system prompt containing a short description of the problem, and is then asked to generate the Python code that solves it.
+
+ ### Task
+ The model then receives the task itself.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite our paper"
+
+ CITATION_BUTTON_TEXT = r"""
+ @misc{pecc,
+     author = {Patrick Haller and Jonas Golde and Alan Akbik},
+     title = {PECC - Problem Extraction and Coding Challenges},
+     year = {2024},
+     publisher = {GitHub},
+     journal = {GitHub repository},
+     howpublished = {}
+ }
+ """