refactor: math parser
3 changed files:
- .pre-commit-config.yaml +5 -13
- llmdataparser/math_parser.py +86 -4
- tests/test_math_parser.py +65 -0
.pre-commit-config.yaml
CHANGED

@@ -1,11 +1,12 @@
 # .pre-commit-config.yaml

 repos:
-  - repo: https://github.com/
-    rev:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.4.4
     hooks:
-      - id:
-
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
   - repo: https://github.com/PyCQA/flake8
     rev: 6.1.0
     hooks:
@@ -64,15 +65,6 @@ repos:
     hooks:
       - id: prettier
         types_or: [markdown, yaml]
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.4.4
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args: [--fix]
-      # Run the formatter.
-      - id: ruff-format
   - repo: https://github.com/kynan/nbstripout
     rev: 0.5.0 # use the latest version
     hooks:
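Net effect of the config change: the Ruff lint hook (with --fix) and the Ruff formatter now run first, ahead of flake8, prettier, and nbstripout, instead of near the end of the hook list. The updated hooks can be exercised locally with pre-commit run --all-files.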
llmdataparser/math_parser.py
CHANGED

@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar

-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)


 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -49,9 +54,9 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
         "all",
     ]
     _default_task: ClassVar[str] = "all"
-    _default_system_prompt: ClassVar[
-
-
+    _default_system_prompt: ClassVar[str] = (
+        "Solve the following mathematics problem step by step:"
+    )
     _valid_levels: ClassVar[set[str]] = {
         f"Level {i}" for i in range(1, 6)
     }  # Levels 1-5 are valid
@@ -84,6 +89,83 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
             solution=row["solution"],
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MATH dataset."""
+        return DatasetDescription.create(
+            name="MATH",
+            purpose="Measure mathematical problem-solving capabilities in machine learning models",
+            source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
+            language="English",
+            format="Competition mathematics problems with step-by-step solutions",
+            characteristics=(
+                "Collection of 12,500 challenging competition mathematics problems designed to "
+                "evaluate mathematical reasoning. Problems include step-by-step solutions that "
+                "can be used to teach models to generate answer derivations and explanations. "
+                "Problems are categorized by subject area and difficulty level (1-5)."
+            ),
+            citation="""@article{hendrycksmath2021,
+    title={Measuring Mathematical Problem Solving With the MATH Dataset},
+    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+    journal={NeurIPS},
+    year={2021}
+}""",
+            additional_info={
+                "difficulty_levels": "1-5",
+                "topics": [
+                    "algebra",
+                    "geometry",
+                    "calculus",
+                    "prealgebra",
+                    "intermediate_algebra",
+                    "number_theory",
+                    "precalculus",
+                ],
+                "size": "12,500 problems",
+                "evaluation_note": "Exact match equivalence calculated using sympy library",
+                "homepage": "https://github.com/hendrycks/math",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for MATH dataset."""
+        return [
+            EvaluationMetric.create(
+                name="symbolic_equivalence",
+                type="exact_match",
+                description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
+                implementation="sympy_equivalence_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="solution_presence",
+                type="text",
+                description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
+                implementation="solution_completeness_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="reasoning_validity",
+                type="text",
+                description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
+                implementation="reasoning_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="mathematical_notation",
+                type="text",
+                description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
+                implementation="notation_validator",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="solution_clarity",
+                type="text",
+                description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
+                implementation="clarity_scorer",
+                primary=False,
+            ),
+        ]
+

 if __name__ == "__main__":
     # Example usage of MATH parser
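For orientation, a minimal usage sketch of the two new methods. This is not part of the commit; in particular it assumes MATHDatasetParser can be constructed with no arguments, which may not match the actual constructor:

    from llmdataparser.math_parser import MATHDatasetParser

    parser = MATHDatasetParser()  # assumed no-arg constructor

    description = parser.get_dataset_description()
    print(description.name)                      # "MATH"
    print(description.additional_info["size"])   # "12,500 problems"

    for metric in parser.get_evaluation_metrics():
        # three of the five metrics are marked primary
        print(f"{metric.name}: primary={metric.primary}")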
tests/test_math_parser.py
CHANGED

@@ -198,3 +198,68 @@ def test_different_splits_parsing(math_parser):
     assert test_count > 0
     assert train_count > 0
     assert train_count != test_count
+
+
+def test_get_dataset_description(math_parser):
+    """Test dataset description generation."""
+    description = math_parser.get_dataset_description()
+
+    assert description.name == "MATH"
+    assert "mathematical problem-solving" in description.purpose.lower()
+    assert "Hendrycks" in description.source
+    assert description.language == "English"
+    assert "competition mathematics problems" in description.format.lower()
+    assert "12,500" in description.characteristics
+    assert "step-by-step solutions" in description.characteristics.lower()
+    assert "hendrycksmath2021" in description.citation
+    assert "NeurIPS" in description.citation
+
+    # Check additional info
+    assert description.additional_info is not None
+    assert description.additional_info["difficulty_levels"] == "1-5"
+    assert "algebra" in description.additional_info["topics"]
+    assert "geometry" in description.additional_info["topics"]
+    assert description.additional_info["size"] == "12,500 problems"
+    assert "sympy" in description.additional_info["evaluation_note"].lower()
+    assert "github.com/hendrycks/math" in description.additional_info["homepage"]
+
+
+def test_get_evaluation_metrics(math_parser):
+    """Test evaluation metrics generation."""
+    metrics = math_parser.get_evaluation_metrics()
+
+    # Check total number of metrics
+    assert len(metrics) == 5
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 3
+
+    # Verify specific metrics exist with correct properties
+    metric_names = {m.name for m in metrics}
+    assert "symbolic_equivalence" in metric_names
+    assert "solution_presence" in metric_names
+    assert "reasoning_validity" in metric_names
+    assert "mathematical_notation" in metric_names
+    assert "solution_clarity" in metric_names
+
+    # Check specific metric properties
+    symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
+    assert symbolic_metric.type == "exact_match"
+    assert symbolic_metric.primary is True
+    assert "sympy" in symbolic_metric.description.lower()
+    assert "equivalence" in symbolic_metric.description.lower()
+
+    solution_metric = next(m for m in metrics if m.name == "solution_presence")
+    assert solution_metric.type == "text"
+    assert solution_metric.primary is True
+    assert "step-by-step" in solution_metric.description.lower()
+
+    reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
+    assert reasoning_metric.type == "text"
+    assert reasoning_metric.primary is True
+    assert "mathematical reasoning" in reasoning_metric.description.lower()
+
+    # Check non-primary metrics
+    non_primary_metrics = {m.name for m in metrics if not m.primary}
+    assert non_primary_metrics == {"mathematical_notation", "solution_clarity"}
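The new tests rely on the module's existing math_parser fixture, which sits outside this diff. Under the same assumed no-arg constructor as above, a minimal sketch of such a fixture would be (the real one may configure or preload the parser differently):

    import pytest

    from llmdataparser.math_parser import MATHDatasetParser

    @pytest.fixture
    def math_parser():
        # fresh parser instance for each test
        return MATHDatasetParser()

With that in place, the additions run via pytest tests/test_math_parser.py.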