Spaces:

SUSTech
/

tlem

Running

App Files Community

facat committed on Nov 30, 2023

Commit

5ca9a91

1 Parent(s): 132574a

update drop

Browse files

Files changed (2) hide show

tasks.py +20 -51
tlem.py +22 -6

tasks.py CHANGED Viewed

@@ -124,6 +124,7 @@ class Task:
             shots = shots.map(
                 lambda example: {
                     self.input_column: example[self.input_column]
                     + example[self.label_column],
                 }
             )[self.input_column]
@@ -193,10 +194,7 @@ def multichoice_zh(responses: Any, references: list[str]):
 class Metrics:
     cmmlu = multichoice_zh
     mmlu = multichoice
-    def ceval(responses: list[str], answers: list[str | int]):
-        responses = [extract_choice_zh(pred) for pred in responses]
-        return responses, answers
     def winogrande(responses: list[str], answers: list[str | int]):
         responses = [first_option_postprocess(pred, options="AB") for pred in responses]
@@ -221,12 +219,8 @@ class Metrics:
         return responses, answers
     def drop(responses: list[str], answers: list[list]):
-        if len(responses) != len(answers):
-            return {"error": "predictions and references have different " "length"}
-        responses = [general_postprocess(pred) for pred in responses]
-        processed_answers = [[general_postprocess(j) for j in i] for i in answers]
         scores = []
-        for pred, ans in zip(responses, processed_answers):
             score = np.mean([1 if a in pred else 0 for a in ans])
             scores.append(score)
         return {"em": np.mean(scores)}
@@ -608,29 +602,14 @@ class DROP:
     input_column = "input"
     label_column = "answers"
-    icl_prompt = """\
-Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
-Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
-Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
-Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
-Question: How many TD passes did Stafford throw other than to Johnson?
-Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
-Text: [PROMPT]
-Question: [QUESTION]
-Anawer:"""
     @classmethod
     def prompt_drop(cls, example):
-        prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
-            "[QUESTION]", example["question"]
-        )
-        validated_answers = example["answers_spans"]["spans"]
-        answers = list(set(validated_answers))
-        return {cls.input_column: prompt, cls.label_column: answers}
     @classmethod
     def suite(
@@ -642,7 +621,8 @@ Anawer:"""
             input_column=cls.input_column,
             label_column=cls.label_column,
             prompt=partial(cls.prompt_drop),
-            few_shot=0,
             split="validation",
         )
@@ -715,28 +695,17 @@ class ARC:
     @classmethod
     def suite(cls):
-        finer_categories = (
-            pd.Series(cls.categories)  # noqa # type: ignore
-            .explode()
-            .reset_index()
-            .set_index(0)
-            .groupby(0)
-            .agg(list)["index"]
-            .to_dict()
-        )
-        suite = defaultdict(list)
-        categories = list(finer_categories.keys())
-        for cate in categories:
-            suite[cate].append(
-                Task(
-                    ("ai2_arc", cate),
-                    metric_name=("sustech/tlem", "arc"),
-                    input_column=cls.input_column,
-                    label_column=cls.label_column,
-                    prompt=partial(cls.prompt_arc),
-                    few_shot=0,
-                )
             )
         return suite

             shots = shots.map(
                 lambda example: {
                     self.input_column: example[self.input_column]
+                    + "\n"
                     + example[self.label_column],
                 }
             )[self.input_column]
 class Metrics:
     cmmlu = multichoice_zh
     mmlu = multichoice
+    ceval = multichoice_zh
     def winogrande(responses: list[str], answers: list[str | int]):
         responses = [first_option_postprocess(pred, options="AB") for pred in responses]
         return responses, answers
     def drop(responses: list[str], answers: list[list]):
         scores = []
+        for pred, ans in zip(responses, answers):
             score = np.mean([1 if a in pred else 0 for a in ans])
             scores.append(score)
         return {"em": np.mean(scores)}
     input_column = "input"
     label_column = "answers"
     @classmethod
     def prompt_drop(cls, example):
+        prompt = f"Read the following passage and answer the question.\n\n{example['passage']}\n\nQuestion: {example['question']}"
+        return {
+            cls.input_column: prompt,
+            cls.label_column: ",".join(example["answers_spans"]["spans"]),
+        }
     @classmethod
     def suite(
             input_column=cls.input_column,
             label_column=cls.label_column,
             prompt=partial(cls.prompt_drop),
+            few_shot=3,
+            few_shot_from="train",
             split="validation",
         )
     @classmethod
     def suite(cls):
+        suite = [
+            Task(
+                ("ai2_arc", subset),
+                metric_name=("sustech/tlem", "arc"),
+                input_column=cls.input_column,
+                label_column=cls.label_column,
+                prompt=partial(cls.prompt_arc),
+                few_shot=0,
             )
+            for subset in cls.categories
+        ]
         return suite

tlem.py CHANGED Viewed

@@ -152,7 +152,7 @@ class Suite(EvaluationSuite):
             case "MATH" | "competition_math":
                 suite = Task(
                     dataset_name="hendrycks/competition_math",
-                    prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form",
                     metric_name=("sustech/tlem", "MATH"),
                     input_column="problem",
                     label_column="solution",
@@ -170,7 +170,20 @@ class Suite(EvaluationSuite):
                     "drop",
                 ]:
                     suite[name] = self.get_suite(name)
         if isinstance(suite, Task):
             suite = [suite]
         if isinstance(suite, list):
@@ -187,10 +200,13 @@ class Suite(EvaluationSuite):
     def drop_duplicates(self, suite):
         for category, tasks in suite.items():
-            if isinstance(tasks, dict):
-                suite[category] = self.drop_duplicates(tasks)
-            else:
-                suite[category] = [self.singleton(task) for task in tasks]
         return suite
     def load(self, name):

             case "MATH" | "competition_math":
                 suite = Task(
                     dataset_name="hendrycks/competition_math",
+                    prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form.",
                     metric_name=("sustech/tlem", "MATH"),
                     input_column="problem",
                     label_column="solution",
                     "drop",
                 ]:
                     suite[name] = self.get_suite(name)
+            case "tlem":
+                suite = {}
+                for name in [
+                    "arc",
+                    "hellaswag",
+                    "mmlu-chat",
+                    "winogrande",
+                    "gsm8k",
+                    "cmmlu-chat",
+                    "ceval-chat",
+                    # "truthful_qa",
+                    "drop",
+                ]:
+                    suite[name] = self.get_suite(name)
         if isinstance(suite, Task):
             suite = [suite]
         if isinstance(suite, list):
     def drop_duplicates(self, suite):
         for category, tasks in suite.items():
+            match tasks:
+                case list():
+                    suite[category] = [self.singleton(task) for task in tasks]
+                case dict():
+                    suite[category] = self.drop_duplicates(tasks)
+                case _:
+                    raise NotImplementedError
         return suite
     def load(self, name):