refactor: mmlu parser
- llmdataparser/mmlu_parser.py  +299 -1
- tests/test_mmlu_parser.py  +94 -0
llmdataparser/mmlu_parser.py
CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, Final

-from llmdataparser.base_parser import
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import MMLU_PRO_SYSTEM_PROMPT, MMLU_SYSTEM_PROMPT

 MMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
@@ -200,6 +205,86 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
         "world_religions",
     ]

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a description of the MMLU dataset."""
+        return DatasetDescription.create(
+            name="Massive Multitask Language Understanding (MMLU)",
+            purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
+            source="https://huggingface.co/datasets/cais/mmlu",
+            language="English",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
+                "and other essential areas of knowledge. The test includes 57 subjects such as "
+                "elementary mathematics, US history, computer science, and law. Success on this test "
+                "requires both extensive world knowledge and strong problem-solving capabilities."
+            ),
+            citation="""@article{hendryckstest2021,
+                title={Measuring Massive Multitask Language Understanding},
+                author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+                journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+                year={2021}
+            }
+            @article{hendrycks2021ethics,
+                title={Aligning AI With Shared Human Values},
+                author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+                journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+                year={2021}
+            }""",
+            additional_info={
+                "subjects": "57 tasks/subjects",
+                "categories": [
+                    "Humanities",
+                    "Social Sciences",
+                    "Hard Sciences",
+                    "Other",
+                ],
+                "example_subjects": [
+                    "Elementary Mathematics",
+                    "US History",
+                    "Computer Science",
+                    "Law",
+                ],
+                "requirements": [
+                    "Extensive world knowledge",
+                    "Problem solving ability",
+                ],
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across all 57 tasks",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="category_accuracy",
+                type="classification",
+                description="Accuracy grouped by major categories (Humanities, Social Sciences, Hard Sciences, Other)",
+                implementation="custom_category_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="task_correlation",
+                type="analysis",
+                description="Analysis of performance correlations between different subjects/tasks",
+                implementation="custom_task_correlation",
+                primary=False,
+            ),
+        ]
+

 class MMLUReduxDatasetParser(MMLUDatasetParser):
     """Parser for the MMLU Redux dataset."""
@@ -239,6 +324,75 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
         "virology",
     ]

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MMLU Redux dataset."""
+        return DatasetDescription.create(
+            name="MMLU Redux",
+            purpose="Provide a manually re-annotated subset of MMLU with error analysis and corrections",
+            source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
+            language="English",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
+                "manually re-annotated to identify and classify various types of errors. "
+                "The dataset maintains the original questions but provides additional "
+                "error annotations and corrections based on expert review and verification "
+                "against credible sources."
+            ),
+            citation="""@misc{gema2024mmlu,
+                title={Are We Done with MMLU?},
+                author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
+                year={2024},
+                eprint={2406.04127},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL}
+            }""",
+            additional_info={
+                "size": "3,000 questions (100 per subject)",
+                "subjects": "30 MMLU subjects",
+                "license": "CC-BY-4.0",
+                "error_types": {
+                    "Question Assessment": [
+                        "Bad Question Clarity",
+                        "Bad Options Clarity",
+                    ],
+                    "Ground Truth Verification": [
+                        "No Correct Answer",
+                        "Multiple Correct Answers",
+                        "Wrong Ground Truth",
+                    ],
+                },
+                "verification_process": "Expert review with source verification",
+                "base_dataset": "cais/mmlu",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU Redux dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across 30 subjects (100 questions each)",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="question_clarity",
+                type="analysis",
+                description="Analysis of performance on questions with different clarity issues",
+                implementation="custom_clarity_analysis",
+                primary=False,
+            ),
+        ]
+

 class TMMLUPlusDatasetParser(MMLUDatasetParser):
     """Parser for the TMMLU+ dataset."""
@@ -333,6 +487,65 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
             prompt, raw_answer, raw_question, raw_choices, raw_answer, task
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the TMMLU+ dataset."""
+        return DatasetDescription.create(
+            name="Traditional Chinese Massive Multitask Language Understanding Plus (TMMLU+)",
+            purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
+            source="https://huggingface.co/datasets/ikala/tmmluplus",
+            language="Traditional Chinese",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
+                "to professional level. The dataset is six times larger than the original TMMLU "
+                "and provides more balanced subject coverage. Includes benchmark results from "
+                "both closed-source models and 20 open-weight Chinese language models with "
+                "parameters ranging from 1.8B to 72B."
+            ),
+            citation="""@article{ikala2024improved,
+                title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
+                author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
+                journal={arXiv preprint arXiv:2403.01858},
+                year={2024}
+            }""",
+            additional_info={
+                "subjects": "66 diverse subjects",
+                "difficulty_levels": ["Elementary", "Secondary", "Professional"],
+                "model_benchmarks": {
+                    "model_types": ["Closed-source models", "Open-weight Chinese LLMs"],
+                    "parameter_range": "1.8B - 72B",
+                },
+                "comparison": "6x larger than original TMMLU",
+                "script": "Traditional Chinese",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for TMMLU+ dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Overall percentage of correctly answered multiple-choice questions",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across all 66 subjects",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="difficulty_analysis",
+                type="classification",
+                description="Performance analysis across different difficulty levels (elementary to professional)",
+                implementation="custom_difficulty_analysis",
+                primary=False,
+            ),
+        ]
+

 class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
     """Parser for the MMLU Pro dataset."""
@@ -401,6 +614,91 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
             prompt, answer_letter, raw_question, raw_choices, raw_answer, final_task
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MMLU Pro dataset."""
+        return DatasetDescription.create(
+            name="MMLU Pro",
+            purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
+            source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+            language="English",
+            format="Multiple choice questions with up to 10 options (expanded from original 4)",
+            characteristics=(
+                "A more challenging version of MMLU containing 12K complex questions across various "
+                "disciplines. Features increased number of options (up to 10), stronger focus on "
+                "reasoning over pure knowledge, and reduced sensitivity to prompt variations. "
+                "Questions are sourced from original MMLU, STEM websites, TheoremQA, and SciBench, "
+                "with expert review and GPT-4 assisted distractor generation."
+            ),
+            citation="""@article{wang2024mmlu,
+                title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
+                author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
+                journal={arXiv preprint arXiv:2406.01574},
+                year={2024}
+            }""",
+            additional_info={
+                "size": "12,000 complex questions",
+                "options": "Up to 10 choices per question",
+                "sources": [
+                    "Original MMLU (filtered)",
+                    "STEM Website",
+                    "TheoremQA",
+                    "SciBench",
+                ],
+                "enhanced_subjects": [
+                    "Biology",
+                    "Business",
+                    "Chemistry",
+                    "Computer Science",
+                    "Economics",
+                    "Engineering",
+                    "Math",
+                    "Physics",
+                    "Psychology",
+                ],
+                "construction_process": [
+                    "Initial MMLU filtering",
+                    "Question collection from multiple sources",
+                    "GPT-4 assisted option augmentation",
+                    "Expert review by 10+ experts",
+                ],
+                "prompt_sensitivity": "2% (reduced from 4-5% in MMLU)",
+                "reasoning_improvement": "20% higher CoT performance compared to PPL",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU Pro dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores with focus on enhanced subjects",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="reasoning_analysis",
+                type="analysis",
+                description="Comparison of Chain-of-Thought vs standard PPL performance",
+                implementation="custom_reasoning_analysis",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="prompt_robustness",
+                type="analysis",
+                description="Analysis of performance stability across different prompt variations",
+                implementation="custom_prompt_sensitivity",
+                primary=False,
+            ),
+        ]
+

 if __name__ == "__main__":
     # Example usage of MMLU Pro parser
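Not part of the diff above: a minimal usage sketch of the metadata methods added in this commit. It assumes the parser can be constructed without arguments, which may not match the actual constructor in this repository.

# Minimal usage sketch (assumption: no-argument constructor; adjust to the
# real parser initialization used elsewhere in llmdataparser).
from llmdataparser.mmlu_parser import BaseMMLUDatasetParser

parser = BaseMMLUDatasetParser()

description = parser.get_dataset_description()
print(description.name)      # "Massive Multitask Language Understanding (MMLU)"
print(description.language)  # "English"

for metric in parser.get_evaluation_metrics():
    # primary=True marks the headline metrics (accuracy, subject_accuracy,
    # category_accuracy); task_correlation is a secondary analysis metric.
    print(metric.name, metric.type, metric.primary)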
tests/test_mmlu_parser.py
CHANGED
@@ -218,3 +218,97 @@ def test_different_splits_parsing(base_parser):
     assert test_count > 0
     assert val_count > 0
     assert test_count != val_count
+
+
+def test_base_mmlu_dataset_description(base_parser):
+    """Test dataset description for base MMLU."""
+    description = base_parser.get_dataset_description()
+
+    assert description.name == "Massive Multitask Language Understanding (MMLU)"
+    assert "cais/mmlu" in description.source
+    assert description.language == "English"
+
+    # Check characteristics
+    assert "57 subjects" in description.characteristics.lower()
+
+    # Check citation
+    assert "hendryckstest2021" in description.citation
+
+
+def test_mmlu_redux_dataset_description(redux_parser):
+    """Test dataset description for MMLU Redux."""
+    description = redux_parser.get_dataset_description()
+
+    assert description.name == "MMLU Redux"
+    assert "manually re-annotated" in description.purpose.lower()
+    assert "edinburgh-dawg/mmlu-redux" in description.source
+    assert description.language == "English"
+
+    # Check characteristics
+    assert "3,000" in description.characteristics
+
+
+def test_tmmlu_plus_dataset_description(tmmlu_parser):
+    """Test dataset description for TMMLU+."""
+    description = tmmlu_parser.get_dataset_description()
+
+    assert "ikala/tmmluplus" in description.source
+    assert description.language == "Traditional Chinese"
+
+    # Check characteristics
+    assert "66 subjects" in description.characteristics.lower()
+
+    # Check citation
+    assert "ikala2024improved" in description.citation
+
+
+def test_mmlu_pro_dataset_description(mmlu_pro_parser):
+    """Test dataset description for MMLU Pro."""
+    description = mmlu_pro_parser.get_dataset_description()
+
+    assert description.name == "MMLU Pro"
+    assert "challenging" in description.purpose.lower()
+    assert "TIGER-Lab/MMLU-Pro" in description.source
+    assert description.language == "English"
+
+
+def test_base_mmlu_evaluation_metrics(base_parser):
+    """Test evaluation metrics for base MMLU."""
+    metrics = base_parser.get_evaluation_metrics()
+
+    assert len(metrics) >= 3
+    metric_names = {m.name for m in metrics}
+
+    assert "accuracy" in metric_names
+    assert "subject_accuracy" in metric_names
+    assert "category_accuracy" in metric_names
+
+    accuracy_metric = next(m for m in metrics if m.name == "accuracy")
+    assert accuracy_metric.type == "classification"
+    assert accuracy_metric.primary is True
+    assert "multiple-choice" in accuracy_metric.description.lower()
+
+
+def test_mmlu_redux_evaluation_metrics(redux_parser):
+    """Test evaluation metrics for MMLU Redux."""
+    metrics = redux_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "question_clarity" in metric_names
+
+
+def test_tmmlu_plus_evaluation_metrics(tmmlu_parser):
+    """Test evaluation metrics for TMMLU+."""
+    metrics = tmmlu_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "difficulty_analysis" in metric_names
+
+
+def test_mmlu_pro_evaluation_metrics(mmlu_pro_parser):
+    """Test evaluation metrics for MMLU Pro."""
+    metrics = mmlu_pro_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "reasoning_analysis" in metric_names
+    assert "prompt_robustness" in metric_names
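The evaluation metrics above reference "custom_subject_accuracy" by name only; its implementation is not part of this commit. Below is a hypothetical sketch of the kind of per-subject grouping such a helper could perform; the function name, signature, and record layout are assumptions for illustration, not the repository's API.

# Hypothetical sketch of a per-subject accuracy computation; not the actual
# "custom_subject_accuracy" implementation referenced by the metrics above.
from collections import defaultdict


def subject_accuracy(records: list[dict]) -> dict[str, float]:
    """Compute accuracy per subject from records with 'subject', 'prediction', 'answer'."""
    correct: defaultdict[str, int] = defaultdict(int)
    total: defaultdict[str, int] = defaultdict(int)
    for record in records:
        total[record["subject"]] += 1
        if record["prediction"] == record["answer"]:
            correct[record["subject"]] += 1
    return {subject: correct[subject] / total[subject] for subject in total}


# Example: two abstract_algebra questions, one answered correctly.
print(subject_accuracy([
    {"subject": "abstract_algebra", "prediction": "A", "answer": "A"},
    {"subject": "abstract_algebra", "prediction": "B", "answer": "C"},
]))  # {'abstract_algebra': 0.5}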