import pytest

from llmdataparser.ifeval_parser import IFEvalDatasetParser, IFEvalParseEntry


@pytest.fixture
def sample_ifeval_entries():
    """Create sample IFEval dataset entries for testing."""
    return [
        {
            "key": 1,
            "prompt": "Write a function to calculate factorial.",
            "instruction_id_list": ["math_001", "programming_001"],
            "kwargs": {"difficulty": "medium", "category": "mathematics"},
        },
        {
            "key": 2,
            "prompt": "Explain quantum computing.",
            "instruction_id_list": ["physics_001"],
            "kwargs": {"difficulty": "hard", "category": "physics"},
        },
    ]


@pytest.fixture
def ifeval_parser():
    """Create an IFEval parser instance."""
    return IFEvalDatasetParser()


def test_ifeval_parse_entry_creation_valid():
    """Test valid creation of IFEvalParseEntry."""
    entry = IFEvalParseEntry.create(
        prompt="Test system prompt\n\nTest instruction",
        answer="",  # IFEval doesn't have answers
        raw_question="Test instruction",
        raw_answer="",
        key=1,
        instruction_id_list=["test_001", "test_002"],
        kwargs={"difficulty": "easy"},
        task_name="default",
    )

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.prompt == "Test system prompt\n\nTest instruction"
    assert entry.answer == ""
    assert entry.key == 1
    assert entry.instruction_id_list == ["test_001", "test_002"]
    assert entry.kwargs == {"difficulty": "easy"}
    assert entry.task_name == "default"


def test_process_entry_ifeval(ifeval_parser, sample_ifeval_entries):
    """Test processing entries in IFEval parser."""
    entry = ifeval_parser.process_entry(sample_ifeval_entries[0])

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.key == 1
    assert entry.instruction_id_list == ["math_001", "programming_001"]
    assert entry.kwargs == {"difficulty": "medium", "category": "mathematics"}
    assert entry.raw_question == "Write a function to calculate factorial."
    assert entry.answer == ""  # IFEval doesn't have answers
    assert entry.task_name == "default"
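

def test_process_entry_ifeval_second_entry(ifeval_parser, sample_ifeval_entries):
    """Companion sketch: process the second fixture entry.

    Assumes process_entry maps fields the same way as it does for the
    first entry above; added to cover a single-instruction sample.
    """
    entry = ifeval_parser.process_entry(sample_ifeval_entries[1])

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.key == 2
    assert entry.instruction_id_list == ["physics_001"]
    assert entry.kwargs == {"difficulty": "hard", "category": "physics"}
    assert entry.raw_question == "Explain quantum computing."
    assert entry.task_name == "default"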


def test_parser_initialization(ifeval_parser):
    """Test initialization of IFEval parser."""
    assert ifeval_parser._data_source == "google/IFEval"
    assert ifeval_parser._default_task == "default"
    assert ifeval_parser.task_names == ["default"]
    assert (
        ifeval_parser.get_huggingface_link
        == "https://huggingface.co/datasets/google/IFEval"
    )


@pytest.mark.integration
def test_load_dataset(ifeval_parser):
    """Test loading the IFEval dataset."""
    ifeval_parser.load(split="train")

    assert ifeval_parser.raw_data is not None
    assert ifeval_parser.split_names == ["train"]
    assert ifeval_parser._current_task == "default"


def test_parser_string_representation(ifeval_parser):
    """Test string representation of IFEval parser."""
    repr_str = str(ifeval_parser)

    assert "IFEvalDatasetParser" in repr_str
    assert "google/IFEval" in repr_str
    assert "not loaded" in repr_str


def test_get_dataset_description(ifeval_parser):
    """Test dataset description generation for IFEval."""
    description = ifeval_parser.get_dataset_description()

    assert description.name == "IFEval"
    assert "verifiable instructions" in description.purpose.lower()
    assert description.source == "Google Research"
    assert description.language == "English (BCP-47 en)"
    assert "verifiable instruction prompts" in description.format.lower()
    assert "500" in description.characteristics
    assert "automated heuristics" in description.characteristics.lower()
    assert "open llm leaderboard" in description.characteristics.lower()
    assert "zhou2023instructionfollowingevaluation" in description.citation


def test_get_evaluation_metrics(ifeval_parser):
    """Test evaluation metrics generation for IFEval."""
    metrics = ifeval_parser.get_evaluation_metrics()

    # Should have 5 metrics total
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist and have correct properties
    metric_names = {m.name for m in metrics}
    assert "format_compliance" in metric_names
    assert "length_constraints" in metric_names
    assert "punctuation_rules" in metric_names
    assert "keyword_usage" in metric_names
    assert "structural_requirements" in metric_names

    # Check specific metric properties
    format_metric = next(m for m in metrics if m.name == "format_compliance")
    assert format_metric.primary is True
    assert "formatting rules" in format_metric.description.lower()
    assert format_metric.type == "text"

    length_metric = next(m for m in metrics if m.name == "length_constraints")
    assert length_metric.primary is True
    assert "word" in length_metric.description.lower()
    assert length_metric.type == "text"

    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
    assert punctuation_metric.primary is True
    assert "punctuation" in punctuation_metric.description.lower()
    assert punctuation_metric.type == "text"
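

def test_evaluation_metrics_have_descriptions(ifeval_parser):
    """Hedged sketch: every metric should carry a non-empty name and description.

    Assumes each metric exposes the .name and .description attributes
    already exercised above; expected uniqueness of the five metric
    names follows from the membership checks in the previous test.
    """
    metrics = ifeval_parser.get_evaluation_metrics()

    names = [m.name for m in metrics]
    # The five metric names should be unique
    assert len(names) == len(set(names))
    for metric in metrics:
        assert metric.name
        assert metric.description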