facat committed on
Commit
3a8c0d0
1 Parent(s): 9827786

!ref suite

Browse files
Files changed (3) hide show
  1. tasks.py +10 -19
  2. tlem.py +55 -28
  3. utils.py +3 -3
tasks.py CHANGED
@@ -65,6 +65,7 @@ class Task:
65
  few_shot: int = 0
66
  few_shot_from: Optional[str] = None
67
  # results: dict[str, Any] = field(default_factory=dict)
 
68
 
69
  def __post_init__(self):
70
  names = (
@@ -142,31 +143,21 @@ class Task:
142
  )
143
  return metric
144
 
 
 
 
 
 
 
 
145
  # @cache
146
  def run(
147
  self,
148
  pipeline,
149
  ):
150
- if (outputs := pipeline(self.samples)) is None:
151
- logging.warning("pipeline returns None")
152
- return
153
- self.outputs = outputs
154
- try:
155
- try:
156
- result = self.metric._compute(
157
- responses=outputs, references=self.dataset[self.label_column]
158
- )
159
- except Exception as e:
160
- result = self.metric.compute(
161
- responses=outputs, references=self.dataset[self.label_column]
162
- )
163
- except Exception as e:
164
- result = outputs
165
- # if log:
166
- # name = name or pipeline.__name__
167
- # self.results[name] = result
168
 
169
- return result
170
 
171
 
172
  def multichoice(responses: Any, references: list[str]):
 
65
  few_shot: int = 0
66
  few_shot_from: Optional[str] = None
67
  # results: dict[str, Any] = field(default_factory=dict)
68
+ outputs: Optional[list] = field(default_factory=list)
69
 
70
  def __post_init__(self):
71
  names = (
 
143
  )
144
  return metric
145
 
146
+ @cached_property
147
+ def result(self) -> dict:
148
+ assert self.outputs, "Please run the task first."
149
+ return self.metric._compute(
150
+ responses=self.outputs, references=self.dataset[self.label_column]
151
+ )
152
+
153
  # @cache
154
  def run(
155
  self,
156
  pipeline,
157
  ):
158
+ self.outputs = self.outputs or pipeline(self.samples)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
+ return self.result
161
 
162
 
163
  def multichoice(responses: Any, references: list[str]):
tlem.py CHANGED
@@ -12,6 +12,7 @@ import datasets
12
  import pandas as pd
13
  from .tasks import *
14
  from .utils import *
 
15
 
16
 
17
  class ReasoningMetric(evaluate.Metric):
@@ -70,33 +71,35 @@ class ReasoningMetric(evaluate.Metric):
70
  class Suite(EvaluationSuite):
71
  task_class = Task
72
 
 
 
 
 
 
 
 
73
  def run(
74
  self,
75
  model_or_pipeline: Any,
 
76
  ) -> dict[str, float]:
77
  self.assert_suite_nonempty()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- def run_tasks(tasks):
80
- for task in (bar := tqdm(tasks, leave=False)):
81
- bar.desc = f"complete {task.name}."
82
- if task.name not in self.cached_result:
83
- self.cached_result[task.name] = task.run(model_or_pipeline)
84
- results = [self.cached_result[task.name] for task in tasks]
85
- return pd.DataFrame(results).mean().to_dict()
86
-
87
- if isinstance(self.suite, dict):
88
- for category, tasks in (bar := tqdm(self.suite.items())):
89
- bar.desc = f"complete {category}."
90
- logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
91
- else:
92
- logging.warning(f"Combined results: {run_tasks(self.suite)}")
93
-
94
- return self.cached_result
95
-
96
- def add(self, name):
97
- self.load(name)
98
-
99
- def load(self, name):
100
  chat = False
101
  match name:
102
  case _ if "chat" in name:
@@ -106,6 +109,8 @@ class Suite(EvaluationSuite):
106
  suite = MMLU.suite(chat=chat)
107
  case _ if name.startswith("cmmlu"):
108
  suite = CMMLU.suite(chat=chat)
 
 
109
  case "gsm8k":
110
  suite = Task(
111
  dataset_name=("gsm8k", "main"),
@@ -123,8 +128,7 @@ class Suite(EvaluationSuite):
123
  suite = DROP.suite()
124
  case "winogrande":
125
  suite = Winogrande.suite()
126
- case _ if name.startswith("ceval"):
127
- suite = CEVAL.suite(chat=chat)
128
  case "mt_bench":
129
  suite = Task(
130
  dataset_name="SUSTech/mt_bench_judge",
@@ -135,16 +139,39 @@ class Suite(EvaluationSuite):
135
  case "MATH" | "competition_math":
136
  suite = Task(
137
  dataset_name="hendrycks/competition_math",
138
- split="test",
139
- prompt="This is a math problem, please think step by step and slove it: {input_column}, simplify your final answer as much as possible and surround them with $ in TeX form",
140
  metric_name=("sustech/tlem", "MATH"),
141
  input_column="problem",
142
  label_column="solution",
143
  )
 
 
 
 
 
 
144
 
145
- self.suite = [suite] if isinstance(suite, Task) else suite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  def __init__(self, name="tlem"):
148
  super().__init__(name)
149
- self.cached_result = {}
150
- self.suite = []
 
12
  import pandas as pd
13
  from .tasks import *
14
  from .utils import *
15
+ from itertools import chain
16
 
17
 
18
  class ReasoningMetric(evaluate.Metric):
 
71
  class Suite(EvaluationSuite):
72
  task_class = Task
73
 
74
+ def __getitem__(self, key) -> Task:
75
+ match key:
76
+ case str():
77
+ return self.suite[key]
78
+ # case _:
79
+ # return list(chain(*self.suite.values()))[key]
80
+
81
  def run(
82
  self,
83
  model_or_pipeline: Any,
84
+ suite=None,
85
  ) -> dict[str, float]:
86
  self.assert_suite_nonempty()
87
+ if suite is None:
88
+ suite = self.suite
89
+
90
+ self.suite: dict[str, list[Task]]
91
+ results = defaultdict(dict)
92
+ for category, tasks in (bar := tqdm(self.suite.items())):
93
+ bar.desc = f"complete {category}."
94
+ if isinstance(tasks, dict):
95
+ results[category] = self.run(model_or_pipeline, tasks)
96
+ else:
97
+ for task in tasks:
98
+ results[category].update(task.run(model_or_pipeline))
99
+ results[category] = np.mean(list(results[category].values()))
100
+ return results
101
 
102
+ def get_suite(self, name) -> dict[str, Task]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  chat = False
104
  match name:
105
  case _ if "chat" in name:
 
109
  suite = MMLU.suite(chat=chat)
110
  case _ if name.startswith("cmmlu"):
111
  suite = CMMLU.suite(chat=chat)
112
+ case _ if name.startswith("ceval"):
113
+ suite = CEVAL.suite(chat=chat)
114
  case "gsm8k":
115
  suite = Task(
116
  dataset_name=("gsm8k", "main"),
 
128
  suite = DROP.suite()
129
  case "winogrande":
130
  suite = Winogrande.suite()
131
+
 
132
  case "mt_bench":
133
  suite = Task(
134
  dataset_name="SUSTech/mt_bench_judge",
 
139
  case "MATH" | "competition_math":
140
  suite = Task(
141
  dataset_name="hendrycks/competition_math",
142
+ prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form",
 
143
  metric_name=("sustech/tlem", "MATH"),
144
  input_column="problem",
145
  label_column="solution",
146
  )
147
+ if isinstance(suite, Task):
148
+ suite = [suite]
149
+ if isinstance(suite, list):
150
+ suite = {name: suite}
151
+
152
+ return suite
153
 
154
+ def singleton(self, task):
155
+ try:
156
+ return self.tasks[self.tasks.index(task)]
157
+
158
+ except Exception as e:
159
+ self.tasks.append(task)
160
+ return self.tasks[-1]
161
+
162
+ def drop_duplicates(self, suite):
163
+ for category, tasks in suite.items():
164
+ if isinstance(tasks, dict):
165
+ suite[category] = self.drop_duplicates(tasks)
166
+ else:
167
+ suite[category] = [self.singleton(task) for task in tasks]
168
+ return suite
169
+
170
+ def load(self, name):
171
+ self.suite.update(self.get_suite(name))
172
+ self.suite = self.drop_duplicates(self.suite)
173
 
174
  def __init__(self, name="tlem"):
175
  super().__init__(name)
176
+ self.tasks = []
177
+ self.suite = {}
utils.py CHANGED
@@ -138,13 +138,13 @@ def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:
138
  def remove_boxed(s):
139
  if "\\boxed " in s:
140
  left = "\\boxed "
141
- assert s[: len(left)] == left
142
  return s[len(left) :]
143
 
144
  left = "\\boxed{"
145
 
146
- assert s[: len(left)] == left
147
- assert s[-1] == "}"
148
 
149
  return s[len(left) : -1]
150
 
 
138
  def remove_boxed(s):
139
  if "\\boxed " in s:
140
  left = "\\boxed "
141
+ assert s[: len(left)] == left, s
142
  return s[len(left) :]
143
 
144
  left = "\\boxed{"
145
 
146
+ assert s[: len(left)] == left, s
147
+ assert s[-1] == "}", s
148
 
149
  return s[len(left) : -1]
150