feat: add description & metrics for bbh
- llmdataparser/bbh_parser.py  +63 -1
- tests/test_bbh_parser.py  +73 -0
llmdataparser/bbh_parser.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, ClassVar
+from typing import Any, ClassVar, Dict, List
 
 from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
 from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this
@@ -87,6 +87,68 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
             task_name=task,
         )
 
+    def get_dataset_description(self) -> Dict[str, str]:
+        """Returns a description of the Big Bench Hard dataset."""
+        return {
+            "name": "Big Bench Hard (BBH)",
+            "purpose": "A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
+            "source": "https://github.com/suzgunmirac/BIG-Bench-Hard",
+            "language": "English",
+            "size": "6.5k examples across 27 tasks (23 core + 4 related)",
+            "format": "Multiple choice questions with single correct answers",
+            "characteristics": (
+                "Tasks require complex multi-step reasoning and were selected based on "
+                "initial model performance below human baseline. Performance can be "
+                "significantly improved through chain-of-thought prompting. The dataset "
+                "includes 23 core tasks plus additional related tasks."
+            ),
+            "model_performance": (
+                "With chain-of-thought prompting, PaLM surpassed human performance on "
+                "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
+            ),
+            "citation": (
+                "@article{suzgun2022challenging,\n"
+                " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
+                ' author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
+                " journal={arXiv preprint arXiv:2210.09261},\n"
+                " year={2022}\n"
+                "}"
+            ),
+        }
+
+    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
+        """Returns the recommended evaluation metrics for BBH dataset."""
+        return [
+            {
+                "name": "accuracy",
+                "type": "classification",
+                "description": "Proportion of exactly correct answers (after stripping parentheses)",
+                "implementation": "evaluate.load('accuracy')",
+                "primary": True,
+            },
+            {
+                "name": "human_eval_delta",
+                "type": "comparison",
+                "description": "Difference between model accuracy and average human-rater performance baseline",
+                "implementation": "custom_human_baseline_comparison",
+                "primary": True,
+            },
+            {
+                "name": "per_task_accuracy",
+                "type": "classification",
+                "description": "Accuracy broken down by individual reasoning tasks",
+                "implementation": "custom_task_accuracy",
+                "primary": False,
+            },
+            {
+                "name": "exact_match",
+                "type": "string_match",
+                "description": "Strict exact match between predicted and target answers",
+                "implementation": "evaluate.load('exact_match')",
+                "primary": False,
+            },
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
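For reference, a minimal usage sketch of the two new accessors. Only the method names and return shapes come from this diff; the no-argument BBHDatasetParser() constructor shown here is an assumption for illustration.

from llmdataparser.bbh_parser import BBHDatasetParser

# Hypothetical usage; assumes the parser can be constructed without arguments.
parser = BBHDatasetParser()

description = parser.get_dataset_description()
print(description["name"])    # "Big Bench Hard (BBH)"
print(description["source"])  # "https://github.com/suzgunmirac/BIG-Bench-Hard"

# Report only the metrics flagged as primary.
for metric in parser.get_evaluation_metrics():
    if metric["primary"]:
        print(metric["name"], "-", metric["implementation"])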
tests/test_bbh_parser.py
CHANGED
@@ -158,3 +158,76 @@ def test_different_tasks_parsing(bbh_parser, task_name):
     assert len(parsed_data) > 0
     assert all(entry.task_name == task_name for entry in parsed_data)
     assert all(isinstance(entry.answer, str) for entry in parsed_data)
+
+
+def test_get_evaluation_metrics(bbh_parser):
+    """Test evaluation metrics structure and content."""
+    metrics = bbh_parser.get_evaluation_metrics()
+
+    # Check basic structure
+    assert isinstance(metrics, list)
+    assert len(metrics) > 0
+
+    # Check each metric has required fields
+    required_fields = ["name", "type", "description", "implementation", "primary"]
+    for metric in metrics:
+        for field in required_fields:
+            assert field in metric, f"Missing field {field} in metric {metric['name']}"
+
+        # Check field types
+        assert isinstance(metric["name"], str)
+        assert isinstance(metric["type"], str)
+        assert isinstance(metric["description"], str)
+        assert isinstance(metric["implementation"], str)
+        assert isinstance(metric["primary"], bool)
+
+    # Check specific metrics exist
+    metric_names = {m["name"] for m in metrics}
+    expected_metrics = {
+        "accuracy",
+        "human_eval_delta",
+        "per_task_accuracy",
+        "exact_match",
+    }
+    assert expected_metrics.issubset(metric_names)
+
+    # Check primary metrics
+    primary_metrics = {m["name"] for m in metrics if m["primary"]}
+    assert "accuracy" in primary_metrics
+    assert "human_eval_delta" in primary_metrics
+
+
+def test_dataset_description_citation_format(bbh_parser):
+    """Test that the citation in dataset description is properly formatted."""
+    description = bbh_parser.get_dataset_description()
+    citation = description["citation"]
+
+    # Check citation structure
+    assert citation.startswith("@article{")
+    assert "title=" in citation
+    assert "author=" in citation
+    assert "journal=" in citation
+    assert "year=" in citation
+
+    # Check specific author formatting
+    assert "Suzgun, Mirac" in citation
+    assert "Wei, Jason" in citation
+    assert "and Wei, Jason" in citation  # Should be last author
+    assert "and and" not in citation  # No double "and"
+
+
+def test_evaluation_metrics_implementations(bbh_parser):
+    """Test that evaluation metric implementations are properly specified."""
+    metrics = bbh_parser.get_evaluation_metrics()
+
+    for metric in metrics:
+        impl = metric["implementation"]
+
+        if "evaluate.load" in impl:
+            # Check standard metric format
+            assert impl.startswith("evaluate.load('")
+            assert impl.endswith("')")
+        elif "custom_" in impl:
+            # Check custom metric format
+            assert impl.startswith("custom_")
+            assert len(impl) > 7  # More than just "custom_"