Update space
Browse files
- src/about.py +4 -5
- src/display/utils.py +20 -20
src/about.py
CHANGED
@@ -57,11 +57,10 @@ TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
-
-
-
-in evaluating others. It achieves a 97\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible.
+Decentralized Arena automates and scales "Chatbot Arena" for LLM evaluation across various fine-grained dimensions
+(e.g., math – algebra, geometry, probability; logical reasoning, social reasoning, biology, chemistry, …).
+The evaluation is decentralized and democratic, with all LLMs participating in evaluating others.
+It achieves a 95\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
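For context, a minimal sketch of how this constant is typically consumed, assuming the Space follows the stock Hugging Face leaderboard template; the app.py wiring below is not part of this commit and is shown only for illustration.

# Illustrative only (assumed template code, not part of this diff):
# app.py in the stock leaderboard template renders the constants from src/about.py.
import gradio as gr

from src.about import INTRODUCTION_TEXT, TITLE

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # the <h1> Decentralized Arena banner
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")  # the rewritten intro above

demo.launch()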
src/display/utils.py
CHANGED
@@ -64,26 +64,26 @@ auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=l
 auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
 
 # fine-graine dimensions
-auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Overall", "number", True))])
-auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Math
-auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Math
-auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Math
-auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Logical Reasoning", "number", True))])
-auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Social Reasoning", "number", True))])
-
-auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("
-auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("
-auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("
-auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("
-auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("
-auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("
-
-auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Overall", "number", True))])
-auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math
-auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math
-auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math
-auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Logical Reasoning", "number", True))])
-auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Social Reasoning", "number", True))])
+auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
+auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
+auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
+auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
+auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
+auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
+
+auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(Overall)", "number", True))])
+auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
+auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
+auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
+auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
+auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
+
+auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Overall)", "number", True))])
+auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
+auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
+auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
+auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
+auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
 
 
 for task in Tasks:
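The renamed display strings above feed the column registry that the leaderboard builds its table from. As a rough sketch of that pattern, assuming the stock Hugging Face leaderboard template; the ColumnContent definition below is an assumption and may differ from the actual one in src/display/utils.py.

# Sketch of the column-registry pattern used above (assumed to follow the stock
# Hugging Face leaderboard template; the ColumnContent fields are an assumption).
from dataclasses import dataclass, field, make_dataclass

@dataclass
class ColumnContent:
    name: str                           # header text shown in the leaderboard table
    type: str                           # gradio Dataframe column type, e.g. "number"
    displayed_by_default: bool = True
    hidden: bool = False

auto_eval_column_dict = []
# Each entry is [attribute_name, type, dataclasses.field(...)], as in the diff above.
auto_eval_column_dict.append(["score_overall", ColumnContent,
                              field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
auto_eval_column_dict.append(["rank_overall", ColumnContent,
                              field(default_factory=lambda: ColumnContent("Rank (Overall)", "number", True))])

# The template turns the registry into a frozen dataclass describing every column.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# Header strings for the displayed columns, e.g. ['Score (Overall)', 'Rank (Overall)']
print([col.name for col in AutoEvalColumn().__dict__.values()])

Because the table only reads the display strings from ColumnContent, this commit can rename every header to the "Score (...)", "Std dev (...)", "Rank (...)" scheme while the attribute names (score_overall, rank_math_algebra, and so on) that the rest of the code keys on stay unchanged.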