JeffYang52415 committed on
Commit 793be05 · unverified · 1 Parent(s): 3aaa6f0

refactor: mmlu parser

llmdataparser/mmlu_parser.py CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, Final

-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import MMLU_PRO_SYSTEM_PROMPT, MMLU_SYSTEM_PROMPT

 MMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
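The new import block pulls DatasetDescription and EvaluationMetric out of llmdataparser.base_parser, which this commit does not touch. A minimal sketch of what those two value objects might look like, assuming simple frozen dataclasses with a create factory; the field names are inferred from the calls in the hunks below, and the real definitions in base_parser.py may differ:

    # Hypothetical sketch of the two objects imported from llmdataparser.base_parser;
    # not part of this commit.
    from dataclasses import dataclass, field
    from typing import Any

    @dataclass(frozen=True)
    class DatasetDescription:
        name: str
        purpose: str
        source: str
        language: str
        format: str
        characteristics: str
        citation: str
        additional_info: dict[str, Any] = field(default_factory=dict)

        @classmethod
        def create(cls, **kwargs: Any) -> "DatasetDescription":
            # Thin factory wrapper, mirroring the DatasetDescription.create(...) calls below.
            return cls(**kwargs)

    @dataclass(frozen=True)
    class EvaluationMetric:
        name: str
        type: str
        description: str
        implementation: str
        primary: bool

        @classmethod
        def create(cls, **kwargs: Any) -> "EvaluationMetric":
            return cls(**kwargs)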
 
@@ -200,6 +205,86 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
        "world_religions",
    ]

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a description of the MMLU dataset."""
+        return DatasetDescription.create(
+            name="Massive Multitask Language Understanding (MMLU)",
+            purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
+            source="https://huggingface.co/datasets/cais/mmlu",
+            language="English",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
+                "and other essential areas of knowledge. The test includes 57 subjects such as "
+                "elementary mathematics, US history, computer science, and law. Success on this test "
+                "requires both extensive world knowledge and strong problem-solving capabilities."
+            ),
+            citation="""@article{hendryckstest2021,
+                title={Measuring Massive Multitask Language Understanding},
+                author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+                journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+                year={2021}
+            }
+            @article{hendrycks2021ethics,
+                title={Aligning AI With Shared Human Values},
+                author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+                journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+                year={2021}
+            }""",
+            additional_info={
+                "subjects": "57 tasks/subjects",
+                "categories": [
+                    "Humanities",
+                    "Social Sciences",
+                    "Hard Sciences",
+                    "Other",
+                ],
+                "example_subjects": [
+                    "Elementary Mathematics",
+                    "US History",
+                    "Computer Science",
+                    "Law",
+                ],
+                "requirements": [
+                    "Extensive world knowledge",
+                    "Problem solving ability",
+                ],
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across all 57 tasks",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="category_accuracy",
+                type="classification",
+                description="Accuracy grouped by major categories (Humanities, Social Sciences, Hard Sciences, Other)",
+                implementation="custom_category_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="task_correlation",
+                type="analysis",
+                description="Analysis of performance correlations between different subjects/tasks",
+                implementation="custom_task_correlation",
+                primary=False,
+            ),
+        ]
+

class MMLUReduxDatasetParser(MMLUDatasetParser):
    """Parser for the MMLU Redux dataset."""
 
@@ -239,6 +324,75 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
        "virology",
    ]

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MMLU Redux dataset."""
+        return DatasetDescription.create(
+            name="MMLU Redux",
+            purpose="Provide a manually re-annotated subset of MMLU with error analysis and corrections",
+            source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
+            language="English",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
+                "manually re-annotated to identify and classify various types of errors. "
+                "The dataset maintains the original questions but provides additional "
+                "error annotations and corrections based on expert review and verification "
+                "against credible sources."
+            ),
+            citation="""@misc{gema2024mmlu,
+                title={Are We Done with MMLU?},
+                author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
+                year={2024},
+                eprint={2406.04127},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL}
+            }""",
+            additional_info={
+                "size": "3,000 questions (100 per subject)",
+                "subjects": "30 MMLU subjects",
+                "license": "CC-BY-4.0",
+                "error_types": {
+                    "Question Assessment": [
+                        "Bad Question Clarity",
+                        "Bad Options Clarity",
+                    ],
+                    "Ground Truth Verification": [
+                        "No Correct Answer",
+                        "Multiple Correct Answers",
+                        "Wrong Ground Truth",
+                    ],
+                },
+                "verification_process": "Expert review with source verification",
+                "base_dataset": "cais/mmlu",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU Redux dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across 30 subjects (100 questions each)",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="question_clarity",
+                type="analysis",
+                description="Analysis of performance on questions with different clarity issues",
+                implementation="custom_clarity_analysis",
+                primary=False,
+            ),
+        ]
+

class TMMLUPlusDatasetParser(MMLUDatasetParser):
    """Parser for the TMMLU+ dataset."""
 
@@ -333,6 +487,65 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
            prompt, raw_answer, raw_question, raw_choices, raw_answer, task
        )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the TMMLU+ dataset."""
+        return DatasetDescription.create(
+            name="Traditional Chinese Massive Multitask Language Understanding Plus (TMMLU+)",
+            purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
+            source="https://huggingface.co/datasets/ikala/tmmluplus",
+            language="Traditional Chinese",
+            format="Multiple choice questions with four options (A, B, C, D)",
+            characteristics=(
+                "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
+                "to professional level. The dataset is six times larger than the original TMMLU "
+                "and provides more balanced subject coverage. Includes benchmark results from "
+                "both closed-source models and 20 open-weight Chinese language models with "
+                "parameters ranging from 1.8B to 72B."
+            ),
+            citation="""@article{ikala2024improved,
+                title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
+                author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
+                journal={arXiv preprint arXiv:2403.01858},
+                year={2024}
+            }""",
+            additional_info={
+                "subjects": "66 diverse subjects",
+                "difficulty_levels": ["Elementary", "Secondary", "Professional"],
+                "model_benchmarks": {
+                    "model_types": ["Closed-source models", "Open-weight Chinese LLMs"],
+                    "parameter_range": "1.8B - 72B",
+                },
+                "comparison": "6x larger than original TMMLU",
+                "script": "Traditional Chinese",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for TMMLU+ dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Overall percentage of correctly answered multiple-choice questions",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores across all 66 subjects",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="difficulty_analysis",
+                type="classification",
+                description="Performance analysis across different difficulty levels (elementary to professional)",
+                implementation="custom_difficulty_analysis",
+                primary=False,
+            ),
+        ]
+

class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
    """Parser for the MMLU Pro dataset."""
 
@@ -401,6 +614,91 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
            prompt, answer_letter, raw_question, raw_choices, raw_answer, final_task
        )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MMLU Pro dataset."""
+        return DatasetDescription.create(
+            name="MMLU Pro",
+            purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
+            source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+            language="English",
+            format="Multiple choice questions with up to 10 options (expanded from original 4)",
+            characteristics=(
+                "A more challenging version of MMLU containing 12K complex questions across various "
+                "disciplines. Features increased number of options (up to 10), stronger focus on "
+                "reasoning over pure knowledge, and reduced sensitivity to prompt variations. "
+                "Questions are sourced from original MMLU, STEM websites, TheoremQA, and SciBench, "
+                "with expert review and GPT-4 assisted distractor generation."
+            ),
+            citation="""@article{wang2024mmlu,
+                title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
+                author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
+                journal={arXiv preprint arXiv:2406.01574},
+                year={2024}
+            }""",
+            additional_info={
+                "size": "12,000 complex questions",
+                "options": "Up to 10 choices per question",
+                "sources": [
+                    "Original MMLU (filtered)",
+                    "STEM Website",
+                    "TheoremQA",
+                    "SciBench",
+                ],
+                "enhanced_subjects": [
+                    "Biology",
+                    "Business",
+                    "Chemistry",
+                    "Computer Science",
+                    "Economics",
+                    "Engineering",
+                    "Math",
+                    "Physics",
+                    "Psychology",
+                ],
+                "construction_process": [
+                    "Initial MMLU filtering",
+                    "Question collection from multiple sources",
+                    "GPT-4 assisted option augmentation",
+                    "Expert review by 10+ experts",
+                ],
+                "prompt_sensitivity": "2% (reduced from 4-5% in MMLU)",
+                "reasoning_improvement": "20% higher CoT performance compared to PPL",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MMLU Pro dataset."""
+        return [
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of correctly answered multiple-choice questions (exact match)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="subject_accuracy",
+                type="classification",
+                description="Per-subject accuracy scores with focus on enhanced subjects",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="reasoning_analysis",
+                type="analysis",
+                description="Comparison of Chain-of-Thought vs standard PPL performance",
+                implementation="custom_reasoning_analysis",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="prompt_robustness",
+                type="analysis",
+                description="Analysis of performance stability across different prompt variations",
+                implementation="custom_prompt_sensitivity",
+                primary=False,
+            ),
+        ]
+

if __name__ == "__main__":
    # Example usage of MMLU Pro parser
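A possible way a caller could surface the new metadata, assuming MMLUProDatasetParser can be constructed with default arguments (its constructor and the __main__ example are outside this hunk):

    # Sketch only: consuming the methods added in this commit.
    from llmdataparser.mmlu_parser import MMLUProDatasetParser

    parser = MMLUProDatasetParser()

    description = parser.get_dataset_description()
    print(description.name, "-", description.source)

    for metric in parser.get_evaluation_metrics():
        marker = "primary" if metric.primary else "secondary"
        print(f"{metric.name} ({metric.type}, {marker}): {metric.implementation}")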
tests/test_mmlu_parser.py CHANGED
@@ -218,3 +218,97 @@ def test_different_splits_parsing(base_parser):
    assert test_count > 0
    assert val_count > 0
    assert test_count != val_count
+
+
+def test_base_mmlu_dataset_description(base_parser):
+    """Test dataset description for base MMLU."""
+    description = base_parser.get_dataset_description()
+
+    assert description.name == "Massive Multitask Language Understanding (MMLU)"
+    assert "cais/mmlu" in description.source
+    assert description.language == "English"
+
+    # Check characteristics
+    assert "57 subjects" in description.characteristics.lower()
+
+    # Check citation
+    assert "hendryckstest2021" in description.citation
+
+
+def test_mmlu_redux_dataset_description(redux_parser):
+    """Test dataset description for MMLU Redux."""
+    description = redux_parser.get_dataset_description()
+
+    assert description.name == "MMLU Redux"
+    assert "manually re-annotated" in description.purpose.lower()
+    assert "edinburgh-dawg/mmlu-redux" in description.source
+    assert description.language == "English"
+
+    # Check characteristics
+    assert "3,000" in description.characteristics
+
+
+def test_tmmlu_plus_dataset_description(tmmlu_parser):
+    """Test dataset description for TMMLU+."""
+    description = tmmlu_parser.get_dataset_description()
+
+    assert "ikala/tmmluplus" in description.source
+    assert description.language == "Traditional Chinese"
+
+    # Check characteristics
+    assert "66 subjects" in description.characteristics.lower()
+
+    # Check citation
+    assert "ikala2024improved" in description.citation
+
+
+def test_mmlu_pro_dataset_description(mmlu_pro_parser):
+    """Test dataset description for MMLU Pro."""
+    description = mmlu_pro_parser.get_dataset_description()
+
+    assert description.name == "MMLU Pro"
+    assert "challenging" in description.purpose.lower()
+    assert "TIGER-Lab/MMLU-Pro" in description.source
+    assert description.language == "English"
+
+
+def test_base_mmlu_evaluation_metrics(base_parser):
+    """Test evaluation metrics for base MMLU."""
+    metrics = base_parser.get_evaluation_metrics()
+
+    assert len(metrics) >= 3
+    metric_names = {m.name for m in metrics}
+
+    assert "accuracy" in metric_names
+    assert "subject_accuracy" in metric_names
+    assert "category_accuracy" in metric_names
+
+    accuracy_metric = next(m for m in metrics if m.name == "accuracy")
+    assert accuracy_metric.type == "classification"
+    assert accuracy_metric.primary is True
+    assert "multiple-choice" in accuracy_metric.description.lower()
+
+
+def test_mmlu_redux_evaluation_metrics(redux_parser):
+    """Test evaluation metrics for MMLU Redux."""
+    metrics = redux_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "question_clarity" in metric_names
+
+
+def test_tmmlu_plus_evaluation_metrics(tmmlu_parser):
+    """Test evaluation metrics for TMMLU+."""
+    metrics = tmmlu_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "difficulty_analysis" in metric_names
+
+
+def test_mmlu_pro_evaluation_metrics(mmlu_pro_parser):
+    """Test evaluation metrics for MMLU Pro."""
+    metrics = mmlu_pro_parser.get_evaluation_metrics()
+
+    metric_names = {m.name for m in metrics}
+    assert "reasoning_analysis" in metric_names
+    assert "prompt_robustness" in metric_names
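The fixtures these tests take (base_parser, redux_parser, tmmlu_parser, mmlu_pro_parser) are defined earlier in tests/test_mmlu_parser.py and are not shown in this hunk; they presumably construct the corresponding parsers, roughly along these lines (a sketch, assuming default constructors):

    # Assumed fixture shapes; the actual fixtures live earlier in the test module.
    import pytest

    from llmdataparser.mmlu_parser import (
        BaseMMLUDatasetParser,
        MMLUProDatasetParser,
        MMLUReduxDatasetParser,
        TMMLUPlusDatasetParser,
    )

    @pytest.fixture
    def base_parser():
        return BaseMMLUDatasetParser()

    @pytest.fixture
    def redux_parser():
        return MMLUReduxDatasetParser()

    @pytest.fixture
    def tmmlu_parser():
        return TMMLUPlusDatasetParser()

    @pytest.fixture
    def mmlu_pro_parser():
        return MMLUProDatasetParser()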