lixuejing committed
Commit: d7938a7
Parent(s): 25c2ffd

update task

Files changed:
- src/about.py (+10 -10)
- src/display/utils.py (+20 -20)
- src/leaderboard/read_evals.py (+7 -0)
src/about.py
CHANGED
@@ -12,16 +12,16 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-
-
-
-
-
-
-
-
-
+    CMMMU = Task("CMMMU", "acc", "CMMMU")
+    CMMU = Task("CMMU", "acc", "CMMU")
+    ChartQA = Task('ChartQA',"acc", "ChartQA")
+    MMMU = Task("MMMU", "acc", "MMMU")
+    OCRBench = Task("OCRBench", "acc", "OCRBench")
+    MMMU_Pro_standard = Task("MMMU_Pro_standard", "acc", "MMMU_Pro_standard")
+    MMMU_Pro_vision = Task("MMMU_Pro_vision", "acc", "MMMU_Pro_vision")
+    MathVision = Task("MathVision", "acc", "MathVision")
+    CII_Bench = Task("CII-Bench", "acc", "CII-Bench")
+    Blink = Task("Blink", "acc", "Blink")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
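Each new enum member wraps a Task value built from three positional arguments: the task key in the results json, the metric key, and the display name. A minimal sketch of how such an enum is typically consumed is below; the Task field names (benchmark, metric, col_name) are assumptions based on the inline comment, since the actual dataclass body in src/about.py is not shown in this diff.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # task_key in the results json
    metric: str      # metric_key in the results json
    col_name: str    # name to display in the leaderboard


class Tasks(Enum):
    CMMMU = Task("CMMMU", "acc", "CMMMU")
    Blink = Task("Blink", "acc", "Blink")
    # ... the remaining benchmarks from the diff above


# Downstream code typically derives the benchmark column list from the enum:
BENCHMARK_COLS = [task.value.col_name for task in Tasks]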
src/display/utils.py
CHANGED
@@ -144,16 +144,16 @@ baseline_row = {
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
+    AutoEvalColumn.CMMMU.name: 100,
+    AutoEvalColumn.CMMU.name: 100,
+    AutoEvalColumn.ChartQA.name: 100,
+    AutoEvalColumn.MMMU.name: 100,
+    AutoEvalColumn.MMMU_Pro_standard.name: 100,
+    AutoEvalColumn.MMMU_Pro_vision.name: 100,
+    AutoEvalColumn.OCRBench.name: 100,
+    AutoEvalColumn.MathVision.name: 100,
+    AutoEvalColumn.CII_Bench.name: 100,
+    AutoEvalColumn.Blink.name: 100,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -166,16 +166,16 @@ human_baseline_row = {
     AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
-    AutoEvalColumn.
+    AutoEvalColumn.CMMMU.name: 100,
+    AutoEvalColumn.CMMU.name: 100,
+    AutoEvalColumn.ChartQA.name: 100,
+    AutoEvalColumn.MMMU.name: 100,
+    AutoEvalColumn.MMMU_Pro_standard.name: 100,
+    AutoEvalColumn.MMMU_Pro_vision.name: 100,
+    AutoEvalColumn.OCRBench.name: 100,
+    AutoEvalColumn.MathVision.name: 100,
+    AutoEvalColumn.CII_Bench.name: 100,
+    AutoEvalColumn.Blink.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
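The new keys rely on AutoEvalColumn exposing one attribute per benchmark (AutoEvalColumn.CMMMU, AutoEvalColumn.Blink, and so on). A minimal sketch of how that can be wired up is below, assuming the usual Hugging Face leaderboard template pattern where AutoEvalColumn is generated from the Tasks enum with make_dataclass; the import path, ColumnContent fields, and extra columns (precision, dummy, model_type, flagged, ...) in the real src/display/utils.py may differ.

from dataclasses import dataclass, make_dataclass

from src.about import Tasks  # assumption: utils.py imports the Tasks enum from src/about.py


@dataclass(frozen=True)
class ColumnContent:
    name: str                   # column header shown in the dataframe
    type: str                   # display type, e.g. "number" or "str"
    displayed_by_default: bool
    hidden: bool = False


# One generated field per benchmark, so AutoEvalColumn.CMMMU.name,
# AutoEvalColumn.Blink.name, etc. can be used as keys in baseline_row above.
auto_eval_column_dict = [
    ["average", ColumnContent, ColumnContent("Average", "number", True)],
]
for task in Tasks:
    auto_eval_column_dict.append(
        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
    )

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

With that in place, AutoEvalColumn.CMMMU is a ColumnContent default whose .name is the display string, which is exactly what the baseline_row and human_baseline_row dictionaries key on.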
src/leaderboard/read_evals.py
CHANGED
@@ -114,6 +114,13 @@ class EvalResult:
             self.status = "FAILED"
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

+    def update_with_dynamic_file_dict(self, file_dict):
+        self.license = file_dict.get("license", "?")
+        self.likes = file_dict.get("likes", 0)
+        self.still_on_hub = file_dict["still_on_hub"]
+        self.flagged = any("flagged" in tag for tag in file_dict["tags"])
+        self.tags = file_dict["tags"]
+
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = 0