facat committed on
Commit
5ca9a91
1 Parent(s): 132574a

update drop

Browse files
Files changed (2) hide show
  1. tasks.py +20 -51
  2. tlem.py +22 -6
tasks.py CHANGED
@@ -124,6 +124,7 @@ class Task:
124
  shots = shots.map(
125
  lambda example: {
126
  self.input_column: example[self.input_column]
 
127
  + example[self.label_column],
128
  }
129
  )[self.input_column]
@@ -193,10 +194,7 @@ def multichoice_zh(responses: Any, references: list[str]):
193
  class Metrics:
194
  cmmlu = multichoice_zh
195
  mmlu = multichoice
196
-
197
- def ceval(responses: list[str], answers: list[str | int]):
198
- responses = [extract_choice_zh(pred) for pred in responses]
199
- return responses, answers
200
 
201
  def winogrande(responses: list[str], answers: list[str | int]):
202
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
@@ -221,12 +219,8 @@ class Metrics:
221
  return responses, answers
222
 
223
  def drop(responses: list[str], answers: list[list]):
224
- if len(responses) != len(answers):
225
- return {"error": "predictions and references have different " "length"}
226
- responses = [general_postprocess(pred) for pred in responses]
227
- processed_answers = [[general_postprocess(j) for j in i] for i in answers]
228
  scores = []
229
- for pred, ans in zip(responses, processed_answers):
230
  score = np.mean([1 if a in pred else 0 for a in ans])
231
  scores.append(score)
232
  return {"em": np.mean(scores)}
@@ -608,29 +602,14 @@ class DROP:
608
  input_column = "input"
609
  label_column = "answers"
610
 
611
- icl_prompt = """\
612
- Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
613
- Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
614
- Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
615
-
616
- Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
617
- Question: How many TD passes did Stafford throw other than to Johnson?
618
- Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
619
-
620
- Text: [PROMPT]
621
- Question: [QUESTION]
622
- Anawer:"""
623
-
624
  @classmethod
625
  def prompt_drop(cls, example):
626
- prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
627
- "[QUESTION]", example["question"]
628
- )
629
 
630
- validated_answers = example["answers_spans"]["spans"]
631
- answers = list(set(validated_answers))
632
-
633
- return {cls.input_column: prompt, cls.label_column: answers}
634
 
635
  @classmethod
636
  def suite(
@@ -642,7 +621,8 @@ Anawer:"""
642
  input_column=cls.input_column,
643
  label_column=cls.label_column,
644
  prompt=partial(cls.prompt_drop),
645
- few_shot=0,
 
646
  split="validation",
647
  )
648
 
@@ -715,28 +695,17 @@ class ARC:
715
 
716
  @classmethod
717
  def suite(cls):
718
- finer_categories = (
719
- pd.Series(cls.categories) # noqa # type: ignore
720
- .explode()
721
- .reset_index()
722
- .set_index(0)
723
- .groupby(0)
724
- .agg(list)["index"]
725
- .to_dict()
726
- )
727
- suite = defaultdict(list)
728
- categories = list(finer_categories.keys())
729
- for cate in categories:
730
- suite[cate].append(
731
- Task(
732
- ("ai2_arc", cate),
733
- metric_name=("sustech/tlem", "arc"),
734
- input_column=cls.input_column,
735
- label_column=cls.label_column,
736
- prompt=partial(cls.prompt_arc),
737
- few_shot=0,
738
- )
739
  )
 
 
740
 
741
  return suite
742
 
 
124
  shots = shots.map(
125
  lambda example: {
126
  self.input_column: example[self.input_column]
127
+ + "\n"
128
  + example[self.label_column],
129
  }
130
  )[self.input_column]
 
194
  class Metrics:
195
  cmmlu = multichoice_zh
196
  mmlu = multichoice
197
+ ceval = multichoice_zh
 
 
 
198
 
199
  def winogrande(responses: list[str], answers: list[str | int]):
200
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
 
219
  return responses, answers
220
 
221
  def drop(responses: list[str], answers: list[list]):
 
 
 
 
222
  scores = []
223
+ for pred, ans in zip(responses, answers):
224
  score = np.mean([1 if a in pred else 0 for a in ans])
225
  scores.append(score)
226
  return {"em": np.mean(scores)}
 
602
  input_column = "input"
603
  label_column = "answers"
604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  @classmethod
606
  def prompt_drop(cls, example):
607
+ prompt = f"Read the following passage and answer the question.\n\n{example['passage']}\n\nQuestion: {example['question']}"
 
 
608
 
609
+ return {
610
+ cls.input_column: prompt,
611
+ cls.label_column: ",".join(example["answers_spans"]["spans"]),
612
+ }
613
 
614
  @classmethod
615
  def suite(
 
621
  input_column=cls.input_column,
622
  label_column=cls.label_column,
623
  prompt=partial(cls.prompt_drop),
624
+ few_shot=3,
625
+ few_shot_from="train",
626
  split="validation",
627
  )
628
 
 
695
 
696
  @classmethod
697
  def suite(cls):
698
+ suite = [
699
+ Task(
700
+ ("ai2_arc", subset),
701
+ metric_name=("sustech/tlem", "arc"),
702
+ input_column=cls.input_column,
703
+ label_column=cls.label_column,
704
+ prompt=partial(cls.prompt_arc),
705
+ few_shot=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
706
  )
707
+ for subset in cls.categories
708
+ ]
709
 
710
  return suite
711
 
tlem.py CHANGED
@@ -152,7 +152,7 @@ class Suite(EvaluationSuite):
152
  case "MATH" | "competition_math":
153
  suite = Task(
154
  dataset_name="hendrycks/competition_math",
155
- prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form",
156
  metric_name=("sustech/tlem", "MATH"),
157
  input_column="problem",
158
  label_column="solution",
@@ -170,7 +170,20 @@ class Suite(EvaluationSuite):
170
  "drop",
171
  ]:
172
  suite[name] = self.get_suite(name)
173
-
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  if isinstance(suite, Task):
175
  suite = [suite]
176
  if isinstance(suite, list):
@@ -187,10 +200,13 @@ class Suite(EvaluationSuite):
187
 
188
  def drop_duplicates(self, suite):
189
  for category, tasks in suite.items():
190
- if isinstance(tasks, dict):
191
- suite[category] = self.drop_duplicates(tasks)
192
- else:
193
- suite[category] = [self.singleton(task) for task in tasks]
 
 
 
194
  return suite
195
 
196
  def load(self, name):
 
152
  case "MATH" | "competition_math":
153
  suite = Task(
154
  dataset_name="hendrycks/competition_math",
155
+ prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form.",
156
  metric_name=("sustech/tlem", "MATH"),
157
  input_column="problem",
158
  label_column="solution",
 
170
  "drop",
171
  ]:
172
  suite[name] = self.get_suite(name)
173
+ case "tlem":
174
+ suite = {}
175
+ for name in [
176
+ "arc",
177
+ "hellaswag",
178
+ "mmlu-chat",
179
+ "winogrande",
180
+ "gsm8k",
181
+ "cmmlu-chat",
182
+ "ceval-chat",
183
+ # "truthful_qa",
184
+ "drop",
185
+ ]:
186
+ suite[name] = self.get_suite(name)
187
  if isinstance(suite, Task):
188
  suite = [suite]
189
  if isinstance(suite, list):
 
200
 
201
  def drop_duplicates(self, suite):
202
  for category, tasks in suite.items():
203
+ match tasks:
204
+ case list():
205
+ suite[category] = [self.singleton(task) for task in tasks]
206
+ case dict():
207
+ suite[category] = self.drop_duplicates(tasks)
208
+ case _:
209
+ raise NotImplementedError
210
  return suite
211
 
212
  def load(self, name):