JeffYang52415 committed
Commit 70da483 · unverified · 1 Parent(s): e5427e0

refactor: humaneval parser

llmdataparser/humaneval_parser.py CHANGED
@@ -1,7 +1,12 @@
  from dataclasses import dataclass
  from typing import Any, ClassVar

- from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+ from llmdataparser.base_parser import (
+     DatasetDescription,
+     EvaluationMetric,
+     HuggingFaceDatasetParser,
+     HuggingFaceParseEntry,
+ )
  from llmdataparser.prompts import HUMANEVAL_SYSTEM_PROMPT


@@ -75,9 +80,72 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
              task_name=task, # Guarantee non-None
          )

+     def get_dataset_description(self) -> DatasetDescription:
+         """Returns description of the HumanEval dataset."""
+         return DatasetDescription.create(
+             name="HumanEval",
+             purpose="Evaluate code generation capabilities through Python programming tasks",
+             source="OpenAI",
+             language="Python",
+             format="Function signatures with docstrings and unit tests",
+             characteristics=(
+                 "Collection of 164 hand-written Python programming problems. Each problem "
+                 "includes a function signature, docstring, example test cases, and hidden unit "
+                 "tests. Problems test basic programming, algorithms, and data structure skills"
+             ),
+             citation="""@article{chen2021codex,
+                 title={Evaluating Large Language Models Trained on Code},
+                 author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+                 year={2021},
+                 eprint={2107.03374},
+                 archivePrefix={arXiv},
+                 primaryClass={cs.LG}
+             }""",
+         )
+
+     def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+         """Returns recommended evaluation metrics for HumanEval."""
+         return [
+             EvaluationMetric.create(
+                 name="pass@k",
+                 type="code",
+                 description="Probability that correct solution appears at least once in k samples",
+                 implementation="custom_pass_at_k",
+                 primary=True,
+             ),
+             EvaluationMetric.create(
+                 name="test_success_rate",
+                 type="code",
+                 description="Percentage of test cases passed by the generated solution",
+                 implementation="custom_test_executor",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="type_correctness",
+                 type="code",
+                 description="Verification of type hints and type safety in generated code",
+                 implementation="custom_type_checker",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="code_style",
+                 type="code",
+                 description="Compliance with Python best practices and PEP 8 guidelines",
+                 implementation="custom_style_checker",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="runtime_efficiency",
+                 type="code",
+                 description="Analysis of time and space complexity of the solution",
+                 implementation="custom_complexity_analyzer",
+                 primary=False,
+             ),
+         ]
+

  class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
-     """Parser for the HumanEval dataset."""
+     """Parser for the enhanced HumanEval Plus dataset with 80x more comprehensive test coverage."""

      _data_source: ClassVar[str] = "evalplus/humanevalplus"
      _default_task: ClassVar[str] = "default"
@@ -110,6 +178,70 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
              task_name=task, # task is guaranteed to be str from _get_current_task
          )

+     def get_dataset_description(self) -> DatasetDescription:
+         """Returns description of the HumanEval Plus dataset."""
+         return DatasetDescription.create(
+             name="HumanEval Plus",
+             purpose="Enhanced evaluation of code generation with 80x more test coverage",
+             source="EvalPlus",
+             language="Python",
+             format="Function signatures with docstrings and comprehensive test suites",
+             characteristics=(
+                 "Significantly enhanced version of HumanEval with 80x more test cases. "
+                 "Includes extensive edge cases, boundary conditions, stress tests, and "
+                 "error handling scenarios to rigorously evaluate code correctness and robustness. "
+                 "Each problem has been augmented with comprehensive testing to catch subtle bugs "
+                 "and ensure production-quality code generation."
+             ),
+             citation="""@inproceedings{evalplus,
+                 title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
+                 author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
+                 booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
+                 year = {2023},
+                 url = {https://openreview.net/forum?id=1qvx610Cu7},
+             }""",
+         )
+
+     def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+         """Returns recommended evaluation metrics for HumanEval Plus."""
+         return [
+             EvaluationMetric.create(
+                 name="pass@k",
+                 type="code",
+                 description="Probability that correct solution appears at least once in k samples",
+                 implementation="custom_pass_at_k",
+                 primary=True,
+             ),
+             EvaluationMetric.create(
+                 name="test_coverage",
+                 type="code",
+                 description="Percentage of edge cases and stress tests passed by the solution",
+                 implementation="custom_coverage_checker",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="error_handling",
+                 type="code",
+                 description="Assessment of solution's robustness in handling invalid inputs and edge cases",
+                 implementation="custom_error_handler",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="performance_stress",
+                 type="code",
+                 description="Evaluation of solution performance under high load and stress conditions",
+                 implementation="custom_stress_tester",
+                 primary=False,
+             ),
+             EvaluationMetric.create(
+                 name="code_quality",
+                 type="code",
+                 description="Analysis of code readability, maintainability and adherence to Python best practices",
+                 implementation="custom_quality_checker",
+                 primary=False,
+             ),
+         ]
+

  if __name__ == "__main__":
      # Example usage
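
Both parsers mark pass@k as their primary metric and point at a custom_pass_at_k implementation that is not part of this diff. For reference, a minimal sketch of the standard unbiased estimator from the cited Chen et al. (2021) paper is shown below; the function name and signature are illustrative, not the repository's actual code.

from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: 1 - C(n - c, k) / C(n, k).

    n: total completions sampled per problem
    c: completions that pass all unit tests
    k: evaluation budget
    """
    if n - c < k:
        # Fewer than k incorrect samples: every size-k draw contains a correct one.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example: with 200 samples and 10 correct, pass@1 is 0.05 and pass@10 is about 0.41.
print(pass_at_k(200, 10, 1), pass_at_k(200, 10, 10))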
tests/test_humaneval_parser.py CHANGED
@@ -171,3 +171,35 @@ def test_plus_get_current_task(plus_parser, plus_sample_entry):
      """Test _get_current_task method for HumanEvalDatasetPlusParser"""
      task = plus_parser._get_current_task(plus_sample_entry)
      assert task == plus_parser._default_task
+
+
+ def test_get_dataset_description(parser, plus_parser):
+     """Test dataset description generation for both parsers."""
+     # Test original HumanEval description
+     description = parser.get_dataset_description()
+     assert description.name == "HumanEval"
+     assert "code generation" in description.purpose
+     assert description.language == "Python"
+     assert "Function signatures with docstrings" in description.format
+     assert "164 hand-written Python programming problems" in description.characteristics
+     assert "chen2021codex" in description.citation
+
+     # Test HumanEval Plus description
+     plus_description = plus_parser.get_dataset_description()
+     assert plus_description.name == "HumanEval Plus"
+     assert "80x more test coverage" in plus_description.purpose
+     assert "comprehensive test suites" in plus_description.format
+     assert "edge cases" in plus_description.characteristics
+     assert "evalplus" in plus_description.citation
+
+
+ def test_get_evaluation_metrics(parser, plus_parser):
+     """Test evaluation metrics generation for both parsers."""
+     # Test original HumanEval metrics
+     metrics = parser.get_evaluation_metrics()
+     assert len(metrics) == 5 # Base metrics + 2 specific metrics
+
+     # Check primary metrics - update to match actual implementation
+     primary_metrics = [m for m in metrics if m.primary]
+     assert len(primary_metrics) == 1 # pass@k
+     assert any(m.name == "pass@k" for m in primary_metrics)
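
Beyond the pytest checks above, the new accessors can be exercised directly. Below is a minimal usage sketch; it assumes the parsers can be constructed without arguments and that EvaluationMetric exposes its create() keyword arguments (name, implementation, primary) as attributes, neither of which this diff shows.

# Minimal usage sketch (assumptions: no-arg constructors; metric fields are attributes).
from llmdataparser.humaneval_parser import (
    HumanEvalDatasetParser,
    HumanEvalDatasetPlusParser,
)

for parser_cls in (HumanEvalDatasetParser, HumanEvalDatasetPlusParser):
    parser = parser_cls()  # assumed no-arg constructor, not shown in this diff
    description = parser.get_dataset_description()
    print(f"{description.name}: {description.purpose}")
    for metric in parser.get_evaluation_metrics():
        role = "primary" if metric.primary else "secondary"
        print(f"  {metric.name} [{role}] -> {metric.implementation}")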