lixuejing committed on
Commit
d7938a7
·
1 Parent(s): 25c2ffd

update task

Browse files
Files changed (3) hide show
  1. src/about.py +10 -10
  2. src/display/utils.py +20 -20
  3. src/leaderboard/read_evals.py +7 -0
src/about.py CHANGED
@@ -12,16 +12,16 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- cmmmu = Task("cmmmu", "acc", "CMMMU")
16
- cmmu = Task("cmmu", "acc", "CMMU")
17
- cv_bench = Task("cv_bench", "acc", "CV_Bench")
18
- hallusion_bench = Task("hallusion_bench", "acc", "Hallusion_Bench")
19
- mmmu = Task("mmmu", "acc", "MMMU")
20
- mmmu_pro_standard = Task("mmmu_pro_standard", "acc", "MMMU_Pro_Standard")
21
- mmmu_pro_vision = Task("mmmu_pro_vision", "acc", "MMMU_Pro_Vision")
22
- ocrbench = Task("ocrbench", "acc", "OCRBench")
23
- math_vision = Task("math_vision", "acc", "Math_Vision")
24
- ciibench = Task("ciibench", "acc", "CIIBench")
25
 
26
  NUM_FEWSHOT = 0 # Change with your few shot
27
  # ---------------------------------------------------
 
12
  # ---------------------------------------------------
13
class Tasks(Enum):
    """Benchmarks shown on the leaderboard.

    Each member wraps a Task(task_key, metric_key, display_name):
    - task_key:   key under which the benchmark appears in the results json
    - metric_key: metric read from that json entry ("acc" for all tasks here)
    - display_name: column header shown in the leaderboard UI
    """

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    CMMMU = Task("CMMMU", "acc", "CMMMU")
    CMMU = Task("CMMU", "acc", "CMMU")
    # Normalized to double quotes / standard spacing for consistency with siblings.
    ChartQA = Task("ChartQA", "acc", "ChartQA")
    MMMU = Task("MMMU", "acc", "MMMU")
    OCRBench = Task("OCRBench", "acc", "OCRBench")
    MMMU_Pro_standard = Task("MMMU_Pro_standard", "acc", "MMMU_Pro_standard")
    MMMU_Pro_vision = Task("MMMU_Pro_vision", "acc", "MMMU_Pro_vision")
    MathVision = Task("MathVision", "acc", "MathVision")
    # NOTE: member name uses "_" but the json key/display name use "-" — keep in sync with results files.
    CII_Bench = Task("CII-Bench", "acc", "CII-Bench")
    Blink = Task("Blink", "acc", "Blink")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------
src/display/utils.py CHANGED
@@ -144,16 +144,16 @@ baseline_row = {
144
  AutoEvalColumn.precision.name: None,
145
  AutoEvalColumn.average.name: 92.75,
146
  AutoEvalColumn.merged.name: False,
147
- AutoEvalColumn.cmmmu.name: 100,
148
- AutoEvalColumn.cmmu.name: 100,
149
- AutoEvalColumn.cv_bench.name: 100,
150
- AutoEvalColumn.hallusion_bench.name: 100,
151
- AutoEvalColumn.mmmu.name: 100,
152
- AutoEvalColumn.mmmu_pro_standard.name: 100,
153
- AutoEvalColumn.mmmu_pro_vision.name: 100,
154
- AutoEvalColumn.ocrbench.name: 100,
155
- AutoEvalColumn.math_vision.name: 100,
156
- AutoEvalColumn.ciibench.name: 100,
157
  AutoEvalColumn.dummy.name: "baseline",
158
  AutoEvalColumn.model_type.name: "",
159
  AutoEvalColumn.flagged.name: False,
@@ -166,16 +166,16 @@ human_baseline_row = {
166
  AutoEvalColumn.precision.name: None,
167
  AutoEvalColumn.average.name: 92.75,
168
  AutoEvalColumn.merged.name: False,
169
- AutoEvalColumn.cmmmu.name: 100,
170
- AutoEvalColumn.cmmu.name: 100,
171
- AutoEvalColumn.cv_bench.name: 100,
172
- AutoEvalColumn.hallusion_bench.name: 100,
173
- AutoEvalColumn.mmmu.name: 100,
174
- AutoEvalColumn.mmmu_pro_standard.name: 100,
175
- AutoEvalColumn.mmmu_pro_vision.name: 100,
176
- AutoEvalColumn.ocrbench.name: 100,
177
- AutoEvalColumn.math_vision.name: 100,
178
- AutoEvalColumn.ciibench.name: 100,
179
  AutoEvalColumn.dummy.name: "human_baseline",
180
  AutoEvalColumn.model_type.name: "",
181
  AutoEvalColumn.flagged.name: False,
 
144
  AutoEvalColumn.precision.name: None,
145
  AutoEvalColumn.average.name: 92.75,
146
  AutoEvalColumn.merged.name: False,
147
+ AutoEvalColumn.CMMMU.name: 100,
148
+ AutoEvalColumn.CMMU.name: 100,
149
+ AutoEvalColumn.ChartQA.name: 100,
150
+ AutoEvalColumn.MMMU.name: 100,
151
+ AutoEvalColumn.MMMU_Pro_standard.name: 100,
152
+ AutoEvalColumn.MMMU_Pro_vision.name: 100,
153
+ AutoEvalColumn.OCRBench.name: 100,
154
+ AutoEvalColumn.MathVision.name: 100,
155
+ AutoEvalColumn.CII_Bench.name: 100,
156
+ AutoEvalColumn.Blink.name: 100,
157
  AutoEvalColumn.dummy.name: "baseline",
158
  AutoEvalColumn.model_type.name: "",
159
  AutoEvalColumn.flagged.name: False,
 
166
  AutoEvalColumn.precision.name: None,
167
  AutoEvalColumn.average.name: 92.75,
168
  AutoEvalColumn.merged.name: False,
169
+ AutoEvalColumn.CMMMU.name: 100,
170
+ AutoEvalColumn.CMMU.name: 100,
171
+ AutoEvalColumn.ChartQA.name: 100,
172
+ AutoEvalColumn.MMMU.name: 100,
173
+ AutoEvalColumn.MMMU_Pro_standard.name: 100,
174
+ AutoEvalColumn.MMMU_Pro_vision.name: 100,
175
+ AutoEvalColumn.OCRBench.name: 100,
176
+ AutoEvalColumn.MathVision.name: 100,
177
+ AutoEvalColumn.CII_Bench.name: 100,
178
+ AutoEvalColumn.Blink.name: 100,
179
  AutoEvalColumn.dummy.name: "human_baseline",
180
  AutoEvalColumn.model_type.name: "",
181
  AutoEvalColumn.flagged.name: False,
src/leaderboard/read_evals.py CHANGED
@@ -114,6 +114,13 @@ class EvalResult:
114
  self.status = "FAILED"
115
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
116
 
 
 
 
 
 
 
 
117
  def to_dict(self):
118
  """Converts the Eval Result to a dict compatible with our dataframe display"""
119
  average = 0
 
114
  self.status = "FAILED"
115
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
116
 
117
def update_with_dynamic_file_dict(self, file_dict):
    """Refresh hub-derived metadata on this eval result from a dynamic-info dict.

    Best-effort update: every field falls back to a neutral default when the
    key is absent, so a partially-populated dynamic file cannot raise KeyError
    (the original mixed .get() with bare subscription for still_on_hub/tags).

    Args:
        file_dict: dict parsed from the model's dynamic info file; may contain
            "license", "likes", "still_on_hub", and "tags".
    """
    self.license = file_dict.get("license", "?")
    self.likes = file_dict.get("likes", 0)
    # Missing key now means "not confirmed on hub" instead of a crash.
    self.still_on_hub = file_dict.get("still_on_hub", False)
    tags = file_dict.get("tags", [])
    # A model is flagged if any of its tags mentions "flagged" (e.g. "flagged:reason").
    self.flagged = any("flagged" in tag for tag in tags)
    self.tags = tags
124
  def to_dict(self):
125
  """Converts the Eval Result to a dict compatible with our dataframe display"""
126
  average = 0