import logging | |
from dataclasses import dataclass, field | |
from simple_parsing.helpers import Serializable | |
# Module-level logger; the channel name is the fixed string "data".
logger = logging.getLogger("data")
@dataclass
class InstructArgs(Serializable):
    """Options for how instruction (conversation) training data is handled.

    Declared as a dataclass so the annotated attributes below become real
    fields with defaults and the class can be used as a
    ``field(default_factory=InstructArgs)`` value.
    """

    # NOTE(review): presumably controls whether instruction samples are
    # shuffled before training -- confirm against the data loader.
    shuffle: bool = True

    # For function calling training examples only the last tool call
    # of the assistant message can be used for training. Therefore,
    # we chunk longer function calling conversations into multiple
    # training samples to not lose any data. E.g.:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # => is chunked into two training samples:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    # ],
    # [
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # NOTE: Only if your data is already pre-chunked should this argument be set to False
    dynamic_chunk_fn_call: bool = True
@dataclass
class DataArgs(Serializable):
    """Locations and sampling options for training and evaluation data.

    The data arguments `data` and `instruct_data` are a string in the format
    "data_source_dir_1:weight_1,data_source_dir_2:weight_2,...". The weight
    will be used to sample the data sources. If the sum of the weights is
    not 1 when concatenating the two arguments `data` and `instruct_data`,
    it will be normalized. The data sources folders must contain jsonl files.
    If the value is an empty string, no data will be used for the
    corresponding data type.

    The class must be a dataclass: without the decorator the
    ``field(default_factory=...)`` default is never materialised and
    ``__post_init__`` is never invoked.

    Raises:
        ValueError: on construction, if ``instruct.shuffle`` is ``False``
            while ``instruct.dynamic_chunk_fn_call`` is ``True``.
    """

    # Each line in the jsonl files inside the data source directories must be
    # a dictionary with a "text" key. See Readme for more details. Can be left
    # empty.
    data: str = ""

    # NOTE(review): presumably toggles shuffling of the `data` sources --
    # confirm against the data loader.
    shuffle: bool = False

    # Each line in the jsonl files inside the data source directories must be
    # a dictionary with an "interactions" key. See Readme for more details.
    # Can be left empty.
    instruct_data: str = ""

    # Same jsonl format as `instruct_data` (dictionary with an "interactions"
    # key). See Readme for more details. Can be left empty.
    eval_instruct_data: str = ""

    # Nested instruction-data options; a fresh InstructArgs per instance.
    instruct: InstructArgs = field(default_factory=InstructArgs)

    def __post_init__(self) -> None:
        """Reject dynamic function-call chunking when shuffling is disabled."""
        # Identity checks are kept on purpose: only a literal False / True
        # pair triggers the error, exactly as before.
        if (
            self.instruct.shuffle is False
            and self.instruct.dynamic_chunk_fn_call is True
        ):
            raise ValueError(
                "Make sure to either enable `data.instruct.shuffle=True` or "
                "`data.instruct.dynamic_chunk_fn_call=False`. Dynamic chunking is only "
                "possible if data is loaded and shuffled before training."
            )