Spaces:
Running
Running
update drop
Browse files
tasks.py
CHANGED
@@ -124,6 +124,7 @@ class Task:
|
|
124 |
shots = shots.map(
|
125 |
lambda example: {
|
126 |
self.input_column: example[self.input_column]
|
|
|
127 |
+ example[self.label_column],
|
128 |
}
|
129 |
)[self.input_column]
|
@@ -193,10 +194,7 @@ def multichoice_zh(responses: Any, references: list[str]):
|
|
193 |
class Metrics:
|
194 |
cmmlu = multichoice_zh
|
195 |
mmlu = multichoice
|
196 |
-
|
197 |
-
def ceval(responses: list[str], answers: list[str | int]):
|
198 |
-
responses = [extract_choice_zh(pred) for pred in responses]
|
199 |
-
return responses, answers
|
200 |
|
201 |
def winogrande(responses: list[str], answers: list[str | int]):
|
202 |
responses = [first_option_postprocess(pred, options="AB") for pred in responses]
|
@@ -221,12 +219,8 @@ class Metrics:
|
|
221 |
return responses, answers
|
222 |
|
223 |
def drop(responses: list[str], answers: list[list]):
|
224 |
-
if len(responses) != len(answers):
|
225 |
-
return {"error": "predictions and references have different " "length"}
|
226 |
-
responses = [general_postprocess(pred) for pred in responses]
|
227 |
-
processed_answers = [[general_postprocess(j) for j in i] for i in answers]
|
228 |
scores = []
|
229 |
-
for pred, ans in zip(responses,
|
230 |
score = np.mean([1 if a in pred else 0 for a in ans])
|
231 |
scores.append(score)
|
232 |
return {"em": np.mean(scores)}
|
@@ -608,29 +602,14 @@ class DROP:
|
|
608 |
input_column = "input"
|
609 |
label_column = "answers"
|
610 |
|
611 |
-
icl_prompt = """\
|
612 |
-
Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
|
613 |
-
Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
|
614 |
-
Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
|
615 |
-
|
616 |
-
Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
|
617 |
-
Question: How many TD passes did Stafford throw other than to Johnson?
|
618 |
-
Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
|
619 |
-
|
620 |
-
Text: [PROMPT]
|
621 |
-
Question: [QUESTION]
|
622 |
-
Anawer:"""
|
623 |
-
|
624 |
@classmethod
|
625 |
def prompt_drop(cls, example):
|
626 |
-
prompt =
|
627 |
-
"[QUESTION]", example["question"]
|
628 |
-
)
|
629 |
|
630 |
-
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
|
635 |
@classmethod
|
636 |
def suite(
|
@@ -642,7 +621,8 @@ Anawer:"""
|
|
642 |
input_column=cls.input_column,
|
643 |
label_column=cls.label_column,
|
644 |
prompt=partial(cls.prompt_drop),
|
645 |
-
few_shot=
|
|
|
646 |
split="validation",
|
647 |
)
|
648 |
|
@@ -715,28 +695,17 @@ class ARC:
|
|
715 |
|
716 |
@classmethod
|
717 |
def suite(cls):
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
)
|
727 |
-
suite = defaultdict(list)
|
728 |
-
categories = list(finer_categories.keys())
|
729 |
-
for cate in categories:
|
730 |
-
suite[cate].append(
|
731 |
-
Task(
|
732 |
-
("ai2_arc", cate),
|
733 |
-
metric_name=("sustech/tlem", "arc"),
|
734 |
-
input_column=cls.input_column,
|
735 |
-
label_column=cls.label_column,
|
736 |
-
prompt=partial(cls.prompt_arc),
|
737 |
-
few_shot=0,
|
738 |
-
)
|
739 |
)
|
|
|
|
|
740 |
|
741 |
return suite
|
742 |
|
|
|
124 |
shots = shots.map(
|
125 |
lambda example: {
|
126 |
self.input_column: example[self.input_column]
|
127 |
+
+ "\n"
|
128 |
+ example[self.label_column],
|
129 |
}
|
130 |
)[self.input_column]
|
|
|
194 |
class Metrics:
|
195 |
cmmlu = multichoice_zh
|
196 |
mmlu = multichoice
|
197 |
+
ceval = multichoice_zh
|
|
|
|
|
|
|
198 |
|
199 |
def winogrande(responses: list[str], answers: list[str | int]):
|
200 |
responses = [first_option_postprocess(pred, options="AB") for pred in responses]
|
|
|
219 |
return responses, answers
|
220 |
|
221 |
def drop(responses: list[str], answers: list[list]):
|
|
|
|
|
|
|
|
|
222 |
scores = []
|
223 |
+
for pred, ans in zip(responses, answers):
|
224 |
score = np.mean([1 if a in pred else 0 for a in ans])
|
225 |
scores.append(score)
|
226 |
return {"em": np.mean(scores)}
|
|
|
602 |
input_column = "input"
|
603 |
label_column = "answers"
|
604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
605 |
@classmethod
|
606 |
def prompt_drop(cls, example):
|
607 |
+
prompt = f"Read the following passage and answer the question.\n\n{example['passage']}\n\nQuestion: {example['question']}"
|
|
|
|
|
608 |
|
609 |
+
return {
|
610 |
+
cls.input_column: prompt,
|
611 |
+
cls.label_column: ",".join(example["answers_spans"]["spans"]),
|
612 |
+
}
|
613 |
|
614 |
@classmethod
|
615 |
def suite(
|
|
|
621 |
input_column=cls.input_column,
|
622 |
label_column=cls.label_column,
|
623 |
prompt=partial(cls.prompt_drop),
|
624 |
+
few_shot=3,
|
625 |
+
few_shot_from="train",
|
626 |
split="validation",
|
627 |
)
|
628 |
|
|
|
695 |
|
696 |
@classmethod
|
697 |
def suite(cls):
|
698 |
+
suite = [
|
699 |
+
Task(
|
700 |
+
("ai2_arc", subset),
|
701 |
+
metric_name=("sustech/tlem", "arc"),
|
702 |
+
input_column=cls.input_column,
|
703 |
+
label_column=cls.label_column,
|
704 |
+
prompt=partial(cls.prompt_arc),
|
705 |
+
few_shot=0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
706 |
)
|
707 |
+
for subset in cls.categories
|
708 |
+
]
|
709 |
|
710 |
return suite
|
711 |
|
tlem.py
CHANGED
@@ -152,7 +152,7 @@ class Suite(EvaluationSuite):
|
|
152 |
case "MATH" | "competition_math":
|
153 |
suite = Task(
|
154 |
dataset_name="hendrycks/competition_math",
|
155 |
-
prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form",
|
156 |
metric_name=("sustech/tlem", "MATH"),
|
157 |
input_column="problem",
|
158 |
label_column="solution",
|
@@ -170,7 +170,20 @@ class Suite(EvaluationSuite):
|
|
170 |
"drop",
|
171 |
]:
|
172 |
suite[name] = self.get_suite(name)
|
173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
if isinstance(suite, Task):
|
175 |
suite = [suite]
|
176 |
if isinstance(suite, list):
|
@@ -187,10 +200,13 @@ class Suite(EvaluationSuite):
|
|
187 |
|
188 |
def drop_duplicates(self, suite):
|
189 |
for category, tasks in suite.items():
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
194 |
return suite
|
195 |
|
196 |
def load(self, name):
|
|
|
152 |
case "MATH" | "competition_math":
|
153 |
suite = Task(
|
154 |
dataset_name="hendrycks/competition_math",
|
155 |
+
prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form.",
|
156 |
metric_name=("sustech/tlem", "MATH"),
|
157 |
input_column="problem",
|
158 |
label_column="solution",
|
|
|
170 |
"drop",
|
171 |
]:
|
172 |
suite[name] = self.get_suite(name)
|
173 |
+
case "tlem":
|
174 |
+
suite = {}
|
175 |
+
for name in [
|
176 |
+
"arc",
|
177 |
+
"hellaswag",
|
178 |
+
"mmlu-chat",
|
179 |
+
"winogrande",
|
180 |
+
"gsm8k",
|
181 |
+
"cmmlu-chat",
|
182 |
+
"ceval-chat",
|
183 |
+
# "truthful_qa",
|
184 |
+
"drop",
|
185 |
+
]:
|
186 |
+
suite[name] = self.get_suite(name)
|
187 |
if isinstance(suite, Task):
|
188 |
suite = [suite]
|
189 |
if isinstance(suite, list):
|
|
|
200 |
|
201 |
def drop_duplicates(self, suite):
|
202 |
for category, tasks in suite.items():
|
203 |
+
match tasks:
|
204 |
+
case list():
|
205 |
+
suite[category] = [self.singleton(task) for task in tasks]
|
206 |
+
case dict():
|
207 |
+
suite[category] = self.drop_duplicates(tasks)
|
208 |
+
case _:
|
209 |
+
raise NotImplementedError
|
210 |
return suite
|
211 |
|
212 |
def load(self, name):
|