JeffYang52415 committed
Commit fb32f8e · unverified · 1 Parent(s): 6ed7950

feat: add description&metrics for bbh

llmdataparser/bbh_parser.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, ClassVar
+from typing import Any, ClassVar, Dict, List
 
 from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
 from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this
@@ -87,6 +87,68 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
             task_name=task,
         )
 
+    def get_dataset_description(self) -> Dict[str, str]:
+        """Returns a description of the Big Bench Hard dataset."""
+        return {
+            "name": "Big Bench Hard (BBH)",
+            "purpose": "A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
+            "source": "https://github.com/suzgunmirac/BIG-Bench-Hard",
+            "language": "English",
+            "size": "6.5k examples across 27 tasks (23 core + 4 related)",
+            "format": "Multiple choice questions with single correct answers",
+            "characteristics": (
+                "Tasks require complex multi-step reasoning and were selected based on "
+                "initial model performance below human baseline. Performance can be "
+                "significantly improved through chain-of-thought prompting. The dataset "
+                "includes 23 core tasks plus additional related tasks."
+            ),
+            "model_performance": (
+                "With chain-of-thought prompting, PaLM surpassed human performance on "
+                "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
+            ),
+            "citation": (
+                "@article{suzgun2022challenging,\n"
+                " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
+                ' author={Suzgun, Mirac and Scales, Nathan and Sch{\\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
+                " journal={arXiv preprint arXiv:2210.09261},\n"
+                " year={2022}\n"
+                "}"
+            ),
+        }
+
+    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
+        """Returns the recommended evaluation metrics for BBH dataset."""
+        return [
+            {
+                "name": "accuracy",
+                "type": "classification",
+                "description": "Proportion of exactly correct answers (after stripping parentheses)",
+                "implementation": "evaluate.load('accuracy')",
+                "primary": True,
+            },
+            {
+                "name": "human_eval_delta",
+                "type": "comparison",
+                "description": "Difference between model accuracy and average human-rater performance baseline",
+                "implementation": "custom_human_baseline_comparison",
+                "primary": True,
+            },
+            {
+                "name": "per_task_accuracy",
+                "type": "classification",
+                "description": "Accuracy broken down by individual reasoning tasks",
+                "implementation": "custom_task_accuracy",
+                "primary": False,
+            },
+            {
+                "name": "exact_match",
+                "type": "string_match",
+                "description": "Strict exact match between predicted and target answers",
+                "implementation": "evaluate.load('exact_match')",
+                "primary": False,
+            },
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
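
For orientation, here is a minimal sketch of how the two new accessors could be exercised. The driver below is illustrative only and assumes BBHDatasetParser can be constructed without arguments, as the module's __main__ example suggests.

# Illustrative usage sketch (not part of the commit); assumes a no-argument constructor.
from llmdataparser.bbh_parser import BBHDatasetParser

parser = BBHDatasetParser()

description = parser.get_dataset_description()
print(description["name"])      # "Big Bench Hard (BBH)"
print(description["purpose"])

for metric in parser.get_evaluation_metrics():
    kind = "primary" if metric["primary"] else "secondary"
    print(f"{metric['name']}: {kind}, implemented via {metric['implementation']}")
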
tests/test_bbh_parser.py CHANGED
@@ -158,3 +158,76 @@ def test_different_tasks_parsing(bbh_parser, task_name):
     assert len(parsed_data) > 0
     assert all(entry.task_name == task_name for entry in parsed_data)
     assert all(isinstance(entry.answer, str) for entry in parsed_data)
+
+
+def test_get_evaluation_metrics(bbh_parser):
+    """Test evaluation metrics structure and content."""
+    metrics = bbh_parser.get_evaluation_metrics()
+
+    # Check basic structure
+    assert isinstance(metrics, list)
+    assert len(metrics) > 0
+
+    # Check each metric has required fields
+    required_fields = ["name", "type", "description", "implementation", "primary"]
+    for metric in metrics:
+        for field in required_fields:
+            assert field in metric, f"Missing field {field} in metric {metric['name']}"
+
+        # Check field types
+        assert isinstance(metric["name"], str)
+        assert isinstance(metric["type"], str)
+        assert isinstance(metric["description"], str)
+        assert isinstance(metric["implementation"], str)
+        assert isinstance(metric["primary"], bool)
+
+    # Check specific metrics exist
+    metric_names = {m["name"] for m in metrics}
+    expected_metrics = {
+        "accuracy",
+        "human_eval_delta",
+        "per_task_accuracy",
+        "exact_match",
+    }
+    assert expected_metrics.issubset(metric_names)
+
+    # Check primary metrics
+    primary_metrics = {m["name"] for m in metrics if m["primary"]}
+    assert "accuracy" in primary_metrics
+    assert "human_eval_delta" in primary_metrics
+
+
+def test_dataset_description_citation_format(bbh_parser):
+    """Test that the citation in dataset description is properly formatted."""
+    description = bbh_parser.get_dataset_description()
+    citation = description["citation"]
+
+    # Check citation structure
+    assert citation.startswith("@article{")
+    assert "title=" in citation
+    assert "author=" in citation
+    assert "journal=" in citation
+    assert "year=" in citation
+
+    # Check specific author formatting
+    assert "Suzgun, Mirac" in citation
+    assert "Wei, Jason" in citation
+    assert "and Wei, Jason" in citation  # Should be last author
+    assert "and and" not in citation  # No double "and"
+
+
+def test_evaluation_metrics_implementations(bbh_parser):
+    """Test that evaluation metric implementations are properly specified."""
+    metrics = bbh_parser.get_evaluation_metrics()
+
+    for metric in metrics:
+        impl = metric["implementation"]
+
+        if "evaluate.load" in impl:
+            # Check standard metric format
+            assert impl.startswith("evaluate.load('")
+            assert impl.endswith("')")
+        elif "custom_" in impl:
+            # Check custom metric format
+            assert impl.startswith("custom_")
+            assert len(impl) > 7  # More than just "custom_"
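
Note that human_eval_delta and per_task_accuracy only name custom implementations (custom_human_baseline_comparison, custom_task_accuracy) that this commit does not ship. Below is a minimal sketch of what the human-baseline comparison could look like, assuming per-task model accuracies and published human-rater baselines are available as plain dictionaries; the signature and helper name are hypothetical.

# Hypothetical sketch of the "custom_human_baseline_comparison" named in the
# metrics list; not shipped code, just an illustration of the intended metric.
from typing import Dict


def custom_human_baseline_comparison(
    task_accuracies: Dict[str, float],
    human_baselines: Dict[str, float],
) -> Dict[str, float]:
    """Return model accuracy minus the average human-rater baseline per task."""
    return {
        task: task_accuracies[task] - human_baselines[task]
        for task in task_accuracies
        if task in human_baselines
    }

Restricting the comparison to tasks present in both dictionaries keeps the delta defined only where a human baseline is actually available.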