File size: 2,763 Bytes
cb9e677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import logging
from dataclasses import dataclass, field

from simple_parsing.helpers import Serializable

logger = logging.getLogger("data")


@dataclass()
class InstructArgs(Serializable):
    """Configuration for how instruction (chat) data is prepared for training."""

    # Whether to shuffle instruction samples before training.
    shuffle: bool = True

    # For function calling training examples only the last tool call
    # of the assistant message can be used for training. Therefore,
    # we chunk longer function calling conversations into multiple
    # training samples to not lose any data. E.g.:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # => is chunked into two training samples:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    # ],
    # [
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # NOTE: Only if your data is already pre-chunked should this argument be set to False
    dynamic_chunk_fn_call: bool = True


@dataclass()
class DataArgs(Serializable):
    """Configuration of the pretraining and instruction data sources.

    The arguments `data`, `instruct_data` and `eval_instruct_data` are
    strings in the format
    "data_source_dir_1:weight_1,data_source_dir_2:weight_2,...". The
    weight is used to sample the data sources. If the sum of the weights
    is not 1 when concatenating the two arguments `data` and
    `instruct_data`, it will be normalized. The data source folders must
    contain jsonl files. If a value is an empty string, no data is used
    for the corresponding data type.
    """

    # Pretraining data. Each line in the jsonl files inside the data source
    # directories must be a dictionary with a "text" key. See Readme for
    # more details. Can be left empty.
    data: str = ""
    # Whether to shuffle the pretraining data.
    shuffle: bool = False
    # Instruction data. Each line in the jsonl files inside the data source
    # directories must be a dictionary with an "interactions" key. See
    # Readme for more details. Can be left empty.
    instruct_data: str = ""
    # Held-out instruction data for evaluation; same format as `instruct_data`.
    # Can be left empty.
    eval_instruct_data: str = ""
    # Options controlling how instruction data is prepared.
    instruct: InstructArgs = field(default_factory=InstructArgs)

    def __post_init__(self) -> None:
        # Dynamic chunking produces multiple samples sharing a common prefix,
        # which is only safe when samples are shuffled before training —
        # reject the invalid combination early.
        if not self.instruct.shuffle and self.instruct.dynamic_chunk_fn_call:
            raise ValueError(
                "Make sure to either enable `data.instruct.shuffle=True` or `data.instruct.dynamic_chunk_fn_call=False`. Dynamic chunking is only possible if data is loaded and shuffled before training."
            )