JeffYang52415 committed
Commit c3a92d2 · unverified · 1 parent: b73a2c7

refactor: math parser

.pre-commit-config.yaml CHANGED
@@ -1,11 +1,12 @@
 # .pre-commit-config.yaml

 repos:
-  - repo: https://github.com/psf/black
-    rev: 23.9.1
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.4.4
     hooks:
-      - id: black
-        additional_dependencies: ["typing-extensions>=4.8.0"]
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
   - repo: https://github.com/PyCQA/flake8
     rev: 6.1.0
     hooks:
@@ -64,15 +65,6 @@ repos:
     hooks:
       - id: prettier
        types_or: [markdown, yaml]
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.4.4
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args: [--fix]
-      # Run the formatter.
-      - id: ruff-format
   - repo: https://github.com/kynan/nbstripout
     rev: 0.5.0 # use the latest version
     hooks:
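With this change, black is removed and ruff plus ruff-format move to the top of the hook list, so ruff now handles both linting and formatting. A quick, repository-agnostic way to sanity-check the updated hooks from Python (a sketch that simply wraps the standard pre-commit CLI, which must already be installed):

# Runs every configured hook, including the new ruff and ruff-format entries,
# against the whole working tree. Equivalent to: pre-commit run --all-files
import subprocess

subprocess.run(["pre-commit", "run", "--all-files"], check=True)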
llmdataparser/math_parser.py CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar

-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)


 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -49,9 +54,9 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
         "all",
     ]
     _default_task: ClassVar[str] = "all"
-    _default_system_prompt: ClassVar[
-        str
-    ] = "Solve the following mathematics problem step by step:"
+    _default_system_prompt: ClassVar[str] = (
+        "Solve the following mathematics problem step by step:"
+    )
     _valid_levels: ClassVar[set[str]] = {
         f"Level {i}" for i in range(1, 6)
     }  # Levels 1-5 are valid
@@ -84,6 +89,83 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
             solution=row["solution"],
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the MATH dataset."""
+        return DatasetDescription.create(
+            name="MATH",
+            purpose="Measure mathematical problem-solving capabilities in machine learning models",
+            source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
+            language="English",
+            format="Competition mathematics problems with step-by-step solutions",
+            characteristics=(
+                "Collection of 12,500 challenging competition mathematics problems designed to "
+                "evaluate mathematical reasoning. Problems include step-by-step solutions that "
+                "can be used to teach models to generate answer derivations and explanations. "
+                "Problems are categorized by subject area and difficulty level (1-5)."
+            ),
+            citation="""@article{hendrycksmath2021,
+    title={Measuring Mathematical Problem Solving With the MATH Dataset},
+    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+    journal={NeurIPS},
+    year={2021}
+}""",
+            additional_info={
+                "difficulty_levels": "1-5",
+                "topics": [
+                    "algebra",
+                    "geometry",
+                    "calculus",
+                    "prealgebra",
+                    "intermediate_algebra",
+                    "number_theory",
+                    "precalculus",
+                ],
+                "size": "12,500 problems",
+                "evaluation_note": "Exact match equivalence calculated using sympy library",
+                "homepage": "https://github.com/hendrycks/math",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for MATH dataset."""
+        return [
+            EvaluationMetric.create(
+                name="symbolic_equivalence",
+                type="exact_match",
+                description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
+                implementation="sympy_equivalence_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="solution_presence",
+                type="text",
+                description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
+                implementation="solution_completeness_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="reasoning_validity",
+                type="text",
+                description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
+                implementation="reasoning_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="mathematical_notation",
+                type="text",
+                description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
+                implementation="notation_validator",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="solution_clarity",
+                type="text",
+                description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
+                implementation="clarity_scorer",
+                primary=False,
+            ),
+        ]
+

 if __name__ == "__main__":
     # Example usage of MATH parser
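As a quick illustration of how the two new methods might be called (a sketch: it assumes MATHDatasetParser can be constructed without arguments, which the test fixture implies but the diff itself does not show):

# Hypothetical usage of the metadata accessors added in this commit.
from llmdataparser.math_parser import MATHDatasetParser

parser = MATHDatasetParser()  # assumption: no-argument construction

description = parser.get_dataset_description()
print(description.name)                     # "MATH"
print(description.additional_info["size"])  # "12,500 problems"

for metric in parser.get_evaluation_metrics():
    role = "primary" if metric.primary else "secondary"
    print(f"{metric.name}: {metric.type} ({role})")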
tests/test_math_parser.py CHANGED
@@ -198,3 +198,68 @@ def test_different_splits_parsing(math_parser):
     assert test_count > 0
     assert train_count > 0
     assert train_count != test_count
+
+
+def test_get_dataset_description(math_parser):
+    """Test dataset description generation."""
+    description = math_parser.get_dataset_description()
+
+    assert description.name == "MATH"
+    assert "mathematical problem-solving" in description.purpose.lower()
+    assert "Hendrycks" in description.source
+    assert description.language == "English"
+    assert "competition mathematics problems" in description.format.lower()
+    assert "12,500" in description.characteristics
+    assert "step-by-step solutions" in description.characteristics.lower()
+    assert "hendrycksmath2021" in description.citation
+    assert "NeurIPS" in description.citation
+
+    # Check additional info
+    assert description.additional_info is not None
+    assert description.additional_info["difficulty_levels"] == "1-5"
+    assert "algebra" in description.additional_info["topics"]
+    assert "geometry" in description.additional_info["topics"]
+    assert description.additional_info["size"] == "12,500 problems"
+    assert "sympy" in description.additional_info["evaluation_note"].lower()
+    assert "github.com/hendrycks/math" in description.additional_info["homepage"]
+
+
+def test_get_evaluation_metrics(math_parser):
+    """Test evaluation metrics generation."""
+    metrics = math_parser.get_evaluation_metrics()
+
+    # Check total number of metrics
+    assert len(metrics) == 5
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 3
+
+    # Verify specific metrics exist with correct properties
+    metric_names = {m.name for m in metrics}
+    assert "symbolic_equivalence" in metric_names
+    assert "solution_presence" in metric_names
+    assert "reasoning_validity" in metric_names
+    assert "mathematical_notation" in metric_names
+    assert "solution_clarity" in metric_names
+
+    # Check specific metric properties
+    symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
+    assert symbolic_metric.type == "exact_match"
+    assert symbolic_metric.primary is True
+    assert "sympy" in symbolic_metric.description.lower()
+    assert "equivalence" in symbolic_metric.description.lower()
+
+    solution_metric = next(m for m in metrics if m.name == "solution_presence")
+    assert solution_metric.type == "text"
+    assert solution_metric.primary is True
+    assert "step-by-step" in solution_metric.description.lower()
+
+    reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
+    assert reasoning_metric.type == "text"
+    assert reasoning_metric.primary is True
+    assert "mathematical reasoning" in reasoning_metric.description.lower()
+
+    # Check non-primary metrics
+    non_primary_metrics = {m.name for m in metrics if not m.primary}
+    assert non_primary_metrics == {"mathematical_notation", "solution_clarity"}
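The metrics above only reference an implementation named sympy_equivalence_checker; its code is not part of this commit. A minimal sketch of the underlying idea, assuming answers arrive as plain sympy-parseable strings (actual MATH answers are LaTeX and would need extra normalization), could look like this:

# Illustrative only: not the repository's sympy_equivalence_checker.
import sympy


def answers_equivalent(predicted: str, reference: str) -> bool:
    """Return True if two answer strings are symbolically equivalent."""
    try:
        pred_expr = sympy.sympify(predicted)
        ref_expr = sympy.sympify(reference)
    except (sympy.SympifyError, TypeError):
        # Fall back to plain string comparison when parsing fails.
        return predicted.strip() == reference.strip()
    # simplify(a - b) reducing to 0 indicates the expressions are mathematically equal.
    return sympy.simplify(pred_expr - ref_expr) == 0


assert answers_equivalent("2*(x + 1)", "2*x + 2")
assert not answers_equivalent("1/2", "0.75")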