add math
tasks.py
CHANGED
@@ -1,8 +1,9 @@
 from dataclasses import dataclass, field
+
 from datasets import load_dataset, Dataset
 from functools import cached_property
 from tqdm.auto import tqdm
-from typing import Any, Optional
+from typing import Any, Optional, Callable
 import logging
 import pandas as pd
 from functools import partial
@@ -187,71 +188,57 @@ def multichoice_zh(responses: Any, references: list[str]):
 class Metrics:
     cmmlu = multichoice_zh
     mmlu = multichoice
+
     def ceval(responses: list[str], answers: list[str | int]):
         responses = [extract_choice_zh(pred) for pred in responses]
         return responses, answers
+
     def winogrande(responses: list[str], answers: list[str | int]):
         responses = [first_option_postprocess(pred, options="AB") for pred in responses]
         return responses, answers
+
     def arc(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
+            return {"error": "predictions and references have different " "length"}
+        responses = [
+            first_option_postprocess(pred, options="ABCD") for pred in responses
+        ]

         return responses, answers
+
     def hellaswag(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
+            return {"error": "predictions and references have different " "length"}
+        responses = [
+            first_option_postprocess(pred, options="ABCD") for pred in responses
+        ]
+        answers = ["ABCD"[int(ans)] for ans in answers]
         return responses, answers
+
     def drop(responses: list[str], answers: list[list]):
         if len(responses) != len(answers):
+            return {"error": "predictions and references have different " "length"}
         responses = [general_postprocess(pred) for pred in responses]
+        processed_answers = [[general_postprocess(j) for j in i] for i in answers]
         matched_answers = []
+        for pred, ans, origin_ans in zip(responses, processed_answers, answers):
             if pred in ans or pred in origin_ans:
                 matched_answers.append(pred)
             else:
                 matched_answers.append(ans[0])
+
         return responses, matched_answers
+
     def bbh_mcq(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
+            return {"error": "predictions and references have different " "length"}
         responses = [bbh_mcq_postprocess(pred) for pred in responses]

         return responses, answers
+
     def bbh_freefrom(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
+            return {"error": "predictions and references have different " "length"}

         responses = [bbh_freeform_postprocess(pred) for pred in responses]
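Note on the hunk above: each metric normalizes free-form generations into comparable labels before scoring, and the one-line error returns rely on Python's implicit string-literal concatenation ("different " "length" joins to "different length"). A minimal, self-contained sketch of the label normalization; pick_first_option is a simplified stand-in for the repo's first_option_postprocess helper, not its actual implementation:

def pick_first_option(pred: str, options: str = "ABCD") -> str:
    # Stand-in: return the first option letter that appears in the text.
    return next((char for char in pred if char in options), "")

responses = ["The answer is B.", "C"]
answers = [1, 2]  # HellaSwag references arrive as indices

responses = [pick_first_option(pred) for pred in responses]
answers = ["ABCD"[int(ans)] for ans in answers]  # same mapping as above
assert responses == answers == ["B", "C"]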
@@ -272,27 +259,16 @@ class Metrics:
         return responses, answers

     def MATH(responses: list[str], answers: list[str]):
+        extract_responses = []
+        for response in responses:
             indices = [pos for pos, char in enumerate(response) if char == "$"]
             if len(indices) <= 2:
+                ans = ""
             else:
+                ans = response[indices[-2] + 1 : indices[-1]]
+            extract_responses.append(strip_string(ans))
+        extract_answers = [strip_string(get_answer(answer)) for answer in answers]
+        return extract_responses, extract_answers


 class CMMLU:
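The new Metrics.MATH reads the model's final $...$ span: indices collects every dollar-sign position, and response[indices[-2] + 1 : indices[-1]] is the text between the last pair of delimiters; strip_string and get_answer, the repo's existing helpers, then canonicalize both sides. The slicing in isolation:

def last_dollar_span(response: str) -> str:
    # Same logic as the hunk above, without strip_string/get_answer.
    indices = [pos for pos, char in enumerate(response) if char == "$"]
    if len(indices) <= 2:
        return ""  # two or fewer "$": treated as no extractable answer
    return response[indices[-2] + 1 : indices[-1]]

assert last_dollar_span("Since $x=2$, the area is $x^2$, i.e. $4$.") == "4"
assert last_dollar_span("no delimiters") == ""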
@@ -570,7 +546,7 @@ class MMLU:
 class Winogrande:
     input_column = "input"
     label_column = "answer"
+
     categories = [
         "winogrande_debiased",
         "winogrande_l",
@@ -579,24 +555,24 @@ class Winogrande:
         "winogrande_xl",
         "winogrande_xs",
     ]
+
     @classmethod
     def prompt_winogrande(cls, example):
+        option1 = example["sentence"].replace("_", example["option1"])
+        option2 = example["sentence"].replace("_", example["option2"])
         answer = example[cls.label_column]
         prompt = f"Which of the following is a good sentence:\nA. {option1}\nB. {option2}\nAnswer:"
+
         return {
             cls.input_column: prompt,
+            cls.label_column: " AB"[int(answer)] if answer != "" else "",
         }
+
     @classmethod
+    def suite(
+        cls,
+    ):
+        subcategories = {item: [item] for item in cls.categories}
         finer_categories = (
             pd.Series(subcategories) # noqa # type: ignore
             .explode()
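Winogrande stores its reference as "1" or "2", so the new " AB"[int(answer)] expression maps it to "A"/"B" (index 0, the space, is unreachable because empty answers are passed through unchanged). A toy check of prompt_winogrande's mapping:

example = {
    "sentence": "The trophy doesn't fit in the suitcase because _ is too big.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}
option1 = example["sentence"].replace("_", example["option1"])
option2 = example["sentence"].replace("_", example["option2"])
answer = example["answer"]
label = " AB"[int(answer)] if answer != "" else ""
assert label == "A" and "the trophy" in option1  # "1" -> "A", "2" -> "B"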
@@ -618,19 +594,18 @@ class Winogrande:
                     label_column=cls.label_column,
                     prompt=partial(cls.prompt_winogrande),
                     few_shot=0,
+                    split="validation",
                 )
             )
+
         return suite


 class DROP:
     input_column = "input"
     label_column = "answers"
+
+    icl_prompt = """\
 Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
 Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
 Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
@@ -641,15 +616,16 @@ Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Jo

 Text: [PROMPT]
 Question: [QUESTION]
+Anawer:"""
+
     categories = ["validation"]
+
     @classmethod
     def prompt_drop(cls, example):
+        prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
+            "[QUESTION]", example["question"]
+        )
+
         validated_answers = example["answers_spans"]["spans"]
         validated_types = example["answers_spans"]["types"]
         answers = []
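prompt_drop fills the few-shot template by plain substring replacement, so [PROMPT] and [QUESTION] act as ad-hoc placeholders (the template's "Anawer:" spelling is kept verbatim here, since the completion is presumably keyed to that exact token). A sketch with a shortened stand-in template:

# Shortened stand-in for DROP.icl_prompt; only the substitution is shown.
icl_prompt = """\
Text: [PROMPT]
Question: [QUESTION]
Anawer:"""

example = {"passage": "The town built 10 parks in 2001.", "question": "How many parks?"}
prompt = icl_prompt.replace("[PROMPT]", example["passage"]).replace(
    "[QUESTION]", example["question"]
)
assert prompt.endswith("Question: How many parks?\nAnawer:")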
@@ -661,18 +637,16 @@ Anawer:'''
             # answers.append(' '.join(d).strip())
             # else:
             # for span in answer_item['spans']:
+            # answers.append(span)
             answers.append(answer_item)
         answers = list(set(answers))
+
+        return {cls.input_column: prompt, cls.label_column: answers}
+
     @classmethod
+    def suite(
+        cls,
+    ):
         finer_categories = (
             pd.Series(cls.categories) # noqa # type: ignore
             .explode()
@@ -693,33 +667,34 @@ Anawer:'''
                     label_column=cls.label_column,
                     prompt=partial(cls.prompt_drop),
                     few_shot=0,
+                    split="validation",
                 )
             )
+
         return suite


 class HellaSwag:
     input_column = "input"
     label_column = "label"
+
     categories = ["validation"]
+
     @classmethod
     def prompt_hellaswag(cls, example):
         prompt = f"{example['ctx']}\nQuestion: Which ending makes the most sense?\n"
         prompt += f"A. {example['endings'][0]}\n"
         prompt += f"B. {example['endings'][1]}\n"
         prompt += f"C. {example['endings'][2]}\n"
         prompt += f"D. {example['endings'][3]}\n"
         prompt += "You may choose from 'A', 'B', 'C', 'D'.\nAnswer:"
+
         return {cls.input_column: prompt}
+
     @classmethod
+    def suite(
+        cls,
+    ):
         finer_categories = (
             pd.Series(cls.categories) # noqa # type: ignore
             .explode()
@@ -740,21 +715,22 @@ class HellaSwag:
                     label_column=cls.label_column,
                     prompt=partial(cls.prompt_hellaswag),
                     few_shot=0,
+                    split="validation",
                 )
             )
+
         return suite

+
 class ARC:
     input_column = "input"
     label_column = "answerKey"
+
     categories = [
         "ARC-Challenge",
         "ARC-Easy",
     ]
+
     @classmethod
     def prompt_arc(cls, example):
         choices = example["choices"]
@@ -762,10 +738,8 @@ class ARC:
         for label, choice in zip(choices["label"], choices["text"]):
             prompt += f"\n{label}. {choice}"
         prompt += "\nAnswer:"
+        return {cls.input_column: prompt}
+
     @classmethod
     def suite(cls):
         finer_categories = (
@@ -790,62 +764,71 @@ class ARC:
                     few_shot=0,
                 )
             )
+
         return suite


 class BBH:
     input_column = "input"
     label_column = "target"
+
     multiple_choice_prefix = "Follow the given examples and answer the question.\n[HINT]\n\nQ: [INPUT]\nA: Let's think step by step."
     free_form_prefix = "Follow the given examples and answer the question.\n[HINT]\n\nQ: [INPUT]\nA: Let's think step by step."
+
     bbh_multiple_choice_sets = [
+        "temporal_sequences",
+        "disambiguation_qa",
+        "date_understanding",
+        "tracking_shuffled_objects_three_objects",
+        "penguins_in_a_table",
+        "geometric_shapes",
+        "snarks",
+        "ruin_names",
+        "tracking_shuffled_objects_seven_objects",
+        "tracking_shuffled_objects_five_objects",
+        "logical_deduction_three_objects",
+        "hyperbaton",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+        "movie_recommendation",
+        "salient_translation_error_detection",
+        "reasoning_about_colored_objects",
     ]
+
     bbh_free_form_sets = [
+        "multistep_arithmetic_two",
+        "navigate",
+        "dyck_languages",
+        "word_sorting",
+        "sports_understanding",
+        "boolean_expressions",
+        "object_counting",
+        "formal_fallacies",
+        "causal_judgement",
+        "web_of_lies",
    ]
+
     @classmethod
+    def prompt_bbh(cls, example, category: str):
+        meta_prompt = (
+            cls.multiple_choice_prefix
+            if category in cls.bbh_multiple_choice_sets
+            else cls.free_form_prefix
+        )
+        prompt = meta_prompt.replace(
+            "[HINT]", bbh_lib_prompt(category=category)
+        ).replace("[INPUT]", example[cls.input_column])
+
         return {"input": prompt}
+
     @classmethod
+    def suite(
+        cls,
+    ):
         finer_categories = (
+            pd.Series(
+                cls.bbh_free_form_sets + cls.bbh_multiple_choice_sets
+            ) # noqa # type: ignore
             .explode()
             .reset_index()
             .set_index(0)
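prompt_bbh chooses its meta-prompt by membership in bbh_multiple_choice_sets; note that as committed, multiple_choice_prefix and free_form_prefix are identical strings, so the branch is currently a no-op. The selection and the [HINT]/[INPUT] substitution in isolation, with a stub in place of the repo's bbh_lib_prompt few-shot loader:

MC_SETS = {"date_understanding", "snarks"}  # abbreviated for the sketch
multiple_choice_prefix = free_form_prefix = (
    "Follow the given examples and answer the question.\n"
    "[HINT]\n\nQ: [INPUT]\nA: Let's think step by step."
)

def hint_stub(category: str) -> str:
    # Stand-in for bbh_lib_prompt(category=...), which loads few-shot examples.
    return f"<few-shot examples for {category}>"

def build_prompt(question: str, category: str) -> str:
    meta_prompt = multiple_choice_prefix if category in MC_SETS else free_form_prefix
    return meta_prompt.replace("[HINT]", hint_stub(category)).replace(
        "[INPUT]", question
    )

print(build_prompt("Is the following date valid: 2023-02-30?", "date_understanding"))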
@@ -878,167 +861,229 @@ class BBH:
                     few_shot=0,
                 )
             )
+
         return suite

+
 class CEVAL:
     input_column = "input"
     label_column = "answer"
+
     @classmethod
+    def prompt_ceval(cls, example, cate: str, chat=False):
         _ch_name = cls.ceval_subject_mapping[cate][1]
+        prefix = f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n" if chat else "问题:"
+
         prompt = prefix + f'{example["question"]}'
         for choice in list("ABCD"):
             prompt += f"\n{choice}. {example[choice]}"

         prompt += "\n答案:"
         return {"input": prompt}
+
     ceval_subject_mapping = {
+        "computer_network": [
+            "Computer Network",
+            "\u8ba1\u7b97\u673a\u7f51\u7edc",
+            "STEM",
+        ],
+        "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
+        "computer_architecture": [
+            "Computer Architecture",
+            "\u8ba1\u7b97\u673a\u7ec4\u6210",
+            "STEM",
+        ],
+        "college_programming": [
+            "College Programming",
+            "\u5927\u5b66\u7f16\u7a0b",
+            "STEM",
+        ],
         "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
+        "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
+        "advanced_mathematics": [
+            "Advanced Mathematics",
+            "\u9ad8\u7b49\u6570\u5b66",
+            "STEM",
+        ],
+        "probability_and_statistics": [
+            "Probability and Statistics",
+            "\u6982\u7387\u7edf\u8ba1",
+            "STEM",
+        ],
+        "discrete_mathematics": [
+            "Discrete Mathematics",
+            "\u79bb\u6563\u6570\u5b66",
+            "STEM",
+        ],
         "electrical_engineer": [
+            "Electrical Engineer",
+            "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08",
+            "STEM",
+        ],
+        "metrology_engineer": [
+            "Metrology Engineer",
+            "\u6ce8\u518c\u8ba1\u91cf\u5e08",
+            "STEM",
+        ],
+        "high_school_mathematics": [
+            "High School Mathematics",
+            "\u9ad8\u4e2d\u6570\u5b66",
+            "STEM",
+        ],
+        "high_school_physics": [
+            "High School Physics",
+            "\u9ad8\u4e2d\u7269\u7406",
+            "STEM",
+        ],
+        "high_school_chemistry": [
+            "High School Chemistry",
+            "\u9ad8\u4e2d\u5316\u5b66",
+            "STEM",
         ],
         "high_school_biology": [
+            "High School Biology",
+            "\u9ad8\u4e2d\u751f\u7269",
+            "STEM",
         ],
         "middle_school_mathematics": [
+            "Middle School Mathematics",
+            "\u521d\u4e2d\u6570\u5b66",
+            "STEM",
         ],
         "middle_school_biology": [
+            "Middle School Biology",
+            "\u521d\u4e2d\u751f\u7269",
+            "STEM",
         ],
         "middle_school_physics": [
+            "Middle School Physics",
+            "\u521d\u4e2d\u7269\u7406",
+            "STEM",
         ],
         "middle_school_chemistry": [
+            "Middle School Chemistry",
+            "\u521d\u4e2d\u5316\u5b66",
+            "STEM",
         ],
+        "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
         "college_economics": [
+            "College Economics",
+            "\u5927\u5b66\u7ecf\u6d4e\u5b66",
+            "Social Science",
         ],
         "business_administration": [
+            "Business Administration",
+            "\u5de5\u5546\u7ba1\u7406",
+            "Social Science",
         ],
         "marxism": [
+            "Marxism",
+            "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406",
+            "Social Science",
         ],
         "mao_zedong_thought": [
             "Mao Zedong Thought",
             "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba",
+            "Social Science",
         ],
         "education_science": [
+            "Education Science",
+            "\u6559\u80b2\u5b66",
+            "Social Science",
         ],
         "teacher_qualification": [
+            "Teacher Qualification",
+            "\u6559\u5e08\u8d44\u683c",
+            "Social Science",
         ],
         "high_school_politics": [
+            "High School Politics",
+            "\u9ad8\u4e2d\u653f\u6cbb",
+            "Social Science",
         ],
         "high_school_geography": [
+            "High School Geography",
+            "\u9ad8\u4e2d\u5730\u7406",
+            "Social Science",
         ],
         "middle_school_politics": [
+            "Middle School Politics",
+            "\u521d\u4e2d\u653f\u6cbb",
+            "Social Science",
         ],
         "middle_school_geography": [
+            "Middle School Geography",
+            "\u521d\u4e2d\u5730\u7406",
+            "Social Science",
+        ],
+        "modern_chinese_history": [
+            "Modern Chinese History",
+            "\u8fd1\u4ee3\u53f2\u7eb2\u8981",
+            "Humanities",
         ],
         "ideological_and_moral_cultivation": [
             "Ideological and Moral Cultivation",
             "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840",
+            "Humanities",
         ],
         "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
         "law": ["Law", "\u6cd5\u5b66", "Humanities"],
         "chinese_language_and_literature": [
             "Chinese Language and Literature",
+            "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66",
+            "Humanities",
         ],
         "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
         "professional_tour_guide": [
+            "Professional Tour Guide",
+            "\u5bfc\u6e38\u8d44\u683c",
+            "Humanities",
         ],
         "legal_professional": [
+            "Legal Professional",
+            "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c",
+            "Humanities",
         ],
         "high_school_chinese": [
+            "High School Chinese",
+            "\u9ad8\u4e2d\u8bed\u6587",
+            "Humanities",
         ],
         "high_school_history": [
+            "High School History",
+            "\u9ad8\u4e2d\u5386\u53f2",
+            "Humanities",
         ],
         "middle_school_history": [
+            "Middle School History",
+            "\u521d\u4e2d\u5386\u53f2",
+            "Humanities",
         ],
         "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
         "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
+        "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
         "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
+        "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
         "urban_and_rural_planner": [
             "Urban and Rural Planner",
+            "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08",
+            "Other",
         ],
         "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
         "fire_engineer": [
+            "Fire Engineer",
+            "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08",
+            "Other",
         ],
         "environmental_impact_assessment_engineer": [
             "Environmental Impact Assessment Engineer",
+            "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08",
+            "Other",
         ],
         "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
+        "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
     }
+
     @classmethod
     def suite(cls, chat: bool):
         suite = defaultdict(list)
@@ -1058,8 +1103,8 @@ class CEVAL:
                     prompt=partial(cls.prompt_ceval, cate=subject, chat=chat),
                     few_shot=0 if chat else 5,
                     few_shot_from="dev",
+                    split="val",
                 )
             )
+
+        return suite
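For the chat path (few_shot=0), prompt_ceval now builds the whole Chinese instruction inline. Rendered against a hypothetical example, an input looks like:

example = {"question": "1 + 1 = ?", "A": "1", "B": "2", "C": "3", "D": "4"}
_ch_name = "高中数学"  # hypothetical subject name, i.e. high_school_mathematics
chat = True
prefix = f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n" if chat else "问题:"
prompt = prefix + f'{example["question"]}'
for choice in list("ABCD"):
    prompt += f"\n{choice}. {example[choice]}"
prompt += "\n答案:"
print(prompt)  # instruction line, then the question, options A-D, and "答案:"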
tlem.py
CHANGED
@@ -135,6 +135,15 @@ class Suite(EvaluationSuite):
                 prompt=mt_bench_prompt
                 # metric_name=("sustech/tlem", "gsm8k"),
             )
+            case "MATH" | "competition_math":
+                suite = Task(
+                    dataset_name="hendrycks/competition_math",
+                    split="test",
+                    prompt="This is a math problem, please think step by step and slove it: {input_column}",
+                    metric_name=("sustech/tlem", "MATH"),
+                    input_column="problem",
+                    label_column="solution",
+                )
         match name:
             case _ if "test" in name:
                 suite = suite["Test"]
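The new branch wires the MATH metric above to the hendrycks/competition_math test split. If the Task prompt is applied with str.format over the input column (an assumption; that mechanism is outside this hunk), a rendered request would look like:

prompt_template = (
    "This is a math problem, please think step by step and slove it: "
    "{input_column}"  # "slove" is verbatim from the committed string
)
print(prompt_template.format(input_column="What is $1+1$?"))
# This is a math problem, please think step by step and slove it: What is $1+1$?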