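"""Tests for the IFEval dataset parser (IFEvalDatasetParser and IFEvalParseEntry)."""
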
import pytest

from llmdataparser.ifeval_parser import IFEvalDatasetParser, IFEvalParseEntry


@pytest.fixture
def sample_ifeval_entries():
    """Create sample IFEval dataset entries for testing."""
    return [
        {
            "key": 1,
            "prompt": "Write a function to calculate factorial.",
            "instruction_id_list": ["math_001", "programming_001"],
            "kwargs": {"difficulty": "medium", "category": "mathematics"},
        },
        {
            "key": 2,
            "prompt": "Explain quantum computing.",
            "instruction_id_list": ["physics_001"],
            "kwargs": {"difficulty": "hard", "category": "physics"},
        },
    ]


@pytest.fixture
def ifeval_parser():
    """Create an IFEval parser instance."""
    return IFEvalDatasetParser()


def test_ifeval_parse_entry_creation_valid():
    """Test valid creation of IFEvalParseEntry."""
    entry = IFEvalParseEntry.create(
        prompt="Test system prompt\n\nTest instruction",
        answer="",  # IFEval provides no reference answers
        raw_question="Test instruction",
        raw_answer="",
        key=1,
        instruction_id_list=["test_001", "test_002"],
        kwargs={"difficulty": "easy"},
        task_name="default",
    )

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.prompt == "Test system prompt\n\nTest instruction"
    assert entry.answer == ""
    assert entry.key == 1
    assert entry.instruction_id_list == ["test_001", "test_002"]
    assert entry.kwargs == {"difficulty": "easy"}
    assert entry.task_name == "default"


def test_process_entry_ifeval(ifeval_parser, sample_ifeval_entries):
    """Test processing entries in IFEval parser."""
    entry = ifeval_parser.process_entry(sample_ifeval_entries[0])

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.key == 1
    assert entry.instruction_id_list == ["math_001", "programming_001"]
    assert entry.kwargs == {"difficulty": "medium", "category": "mathematics"}
    assert entry.raw_question == "Write a function to calculate factorial."
    assert entry.answer == ""  # IFEval provides no reference answers
    assert entry.task_name == "default"
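

# A minimal companion check, assuming process_entry maps fields uniformly
# across entries; it exercises the second fixture entry (single instruction
# id) in the same style as the test above.
def test_process_entry_ifeval_second_entry(ifeval_parser, sample_ifeval_entries):
    """Test processing the second sample entry in IFEval parser."""
    entry = ifeval_parser.process_entry(sample_ifeval_entries[1])

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.key == 2
    assert entry.instruction_id_list == ["physics_001"]
    assert entry.kwargs == {"difficulty": "hard", "category": "physics"}
    assert entry.raw_question == "Explain quantum computing."
    assert entry.task_name == "default"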


def test_parser_initialization(ifeval_parser):
    """Test initialization of IFEval parser."""
    assert ifeval_parser._data_source == "google/IFEval"
    assert ifeval_parser._default_task == "default"
    assert ifeval_parser.task_names == ["default"]
    assert (
        ifeval_parser.get_huggingface_link
        == "https://huggingface.co/datasets/google/IFEval"
    )


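# The test below downloads the real google/IFEval dataset; deselect it when
# offline with ``pytest -m "not integration"`` (this assumes the marker is
# registered in the project's pytest config to avoid unknown-mark warnings).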
@pytest.mark.integration
def test_load_dataset(ifeval_parser):
    """Test loading the IFEval dataset."""
    ifeval_parser.load(split="train")
    assert ifeval_parser.raw_data is not None
    assert ifeval_parser.split_names == ["train"]
    assert ifeval_parser._current_task == "default"


def test_parser_string_representation(ifeval_parser):
    """Test string representation of IFEval parser."""
    repr_str = str(ifeval_parser)
    assert "IFEvalDatasetParser" in repr_str
    assert "google/IFEval" in repr_str
    assert "not loaded" in repr_str


def test_get_dataset_description(ifeval_parser):
    """Test dataset description generation for IFEval."""
    description = ifeval_parser.get_dataset_description()

    assert description.name == "IFEval"
    assert "verifiable instructions" in description.purpose.lower()
    assert description.source == "Google Research"
    assert description.language == "English (BCP-47 en)"
    assert "verifiable instruction prompts" in description.format.lower()
    assert "500" in description.characteristics
    assert "automated heuristics" in description.characteristics.lower()
    assert "open llm leaderboard" in description.characteristics.lower()
    assert "zhou2023instructionfollowingevaluation" in description.citation


def test_get_evaluation_metrics(ifeval_parser):
    """Test evaluation metrics generation for IFEval."""
    metrics = ifeval_parser.get_evaluation_metrics()

    # Should have 5 metrics total
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist and have correct properties
    metric_names = {m.name for m in metrics}
    assert "format_compliance" in metric_names
    assert "length_constraints" in metric_names
    assert "punctuation_rules" in metric_names
    assert "keyword_usage" in metric_names
    assert "structural_requirements" in metric_names

    # Check specific metric properties
    format_metric = next(m for m in metrics if m.name == "format_compliance")
    assert format_metric.primary is True
    assert "formatting rules" in format_metric.description.lower()
    assert format_metric.type == "text"

    length_metric = next(m for m in metrics if m.name == "length_constraints")
    assert length_metric.primary is True
    assert "word" in length_metric.description.lower()
    assert length_metric.type == "text"

    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
    assert punctuation_metric.primary is True
    assert "punctuation" in punctuation_metric.description.lower()
    assert punctuation_metric.type == "text"
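

# Follows from the counts asserted above: with five metrics total and the
# three named primaries, keyword_usage and structural_requirements must be
# the two secondary metrics. A small sketch making that expectation explicit.
def test_secondary_metrics_not_primary(ifeval_parser):
    """The two remaining metrics should be non-primary."""
    metrics = ifeval_parser.get_evaluation_metrics()
    secondary = {m.name for m in metrics if not m.primary}
    assert secondary == {"keyword_usage", "structural_requirements"}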