import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from simple_parsing.helpers import Serializable

from model.args import LoraArgs

from .data.args import DataArgs


@dataclass
class OptimArgs(Serializable):
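    """Optimizer and learning-rate schedule hyperparameters
    (`pct_start` is conventionally the fraction of steps spent warming up)."""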
    lr: float = 3e-4
    weight_decay: float = 0.1
    pct_start: float = 0.3


@dataclass
class WandbArgs(Serializable):
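    """Weights & Biases logging configuration; logging is enabled only when
    `project` is set."""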
    project: Optional[str] = None  # Fill this argument to use wandb.
    offline: bool = False
    key: Optional[str] = None
    # Name of the wandb run; if None it will be set to the name of the run_dir.
    run_name: Optional[str] = None

    def __post_init__(self) -> None:
        if self.project is not None:
            try:
                import wandb  # noqa: F401
            except ImportError:
                raise ImportError("`wandb` not installed. Either make sure `wandb` is installed or set `wandb.project` to None.")

            if len(self.project) == 0:
                raise ValueError("`wandb.project` must not be an empty string.")


@dataclass
class MLFlowArgs(Serializable):
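    """MLflow tracking configuration; tracking is enabled only when
    `tracking_uri` is set."""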
    tracking_uri: Optional[str] = None
    experiment_name: Optional[str] = None

    def __post_init__(self) -> None:
        if self.tracking_uri is not None:
            try:
                import mlflow  # noqa: F401
            except ImportError:
                raise ImportError("`mlflow` not installed. Either make sure `mlflow` is installed or set `mlflow.tracking_uri` to None.")

            if self.experiment_name is None:
                raise ValueError("If `mlflow.tracking_uri` is set, `mlflow.experiment_name` must be set as well.")


@dataclass
class TrainArgs(Serializable):
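    """Top-level training configuration: data, model location, optimizer,
    checkpointing, logging, and LoRA settings."""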
    data: DataArgs

    # if specified, instruct_tokenizer and model will be loaded
    model_id_or_path: str  # Path to the directory containing the initial model or model id: "mistral-small"
    run_dir: str  # Path to the directory where everything will be saved. It needs to be empty.

    optim: OptimArgs = field(default_factory=OptimArgs)
    seed: int = 0
    # Number of steps to accumulate gradients before doing an optimizer step.
    num_microbatches: int = 1
    seq_len: int = 2048  # Number of tokens per batch per device.
    batch_size: int = 1
    max_norm: float = 1.0  # Gradient clipping.
    max_steps: int = 100  # Number of training steps.
    log_freq: int = 1  # Number of steps between each logging.

    # Number of steps between each checkpoint saving. If less than 1, only the last checkpoint will be saved.
    ckpt_freq: int = 0
    ckpt_only_lora: bool = True
    # If True, no checkpoint will be saved. This is useful for development.
    no_ckpt: bool = False
    num_ckpt_keep: Optional[int] = 3
    eval_freq: int = 0
    no_eval: bool = True

    # Efficiency
    # Determines whether gradient checkpointing should be used during training. Gradient checkpointing reduces memory usage at the cost of slightly longer training times.
    checkpoint: bool = True

    world_size: Optional[int] = field(init=False, default=None)

    # logging
    wandb: WandbArgs = field(default_factory=WandbArgs)
    mlflow: MLFlowArgs = field(default_factory=MLFlowArgs)

    # LoRA
    lora: Optional[LoraArgs] = field(default_factory=LoraArgs)

    def __post_init__(self) -> None:
        assert getattr(self, "world_size", None) is None
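        # WORLD_SIZE is set by distributed launchers (e.g. torchrun); -1 means it was not set.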
        self.world_size = int(os.environ.get("WORLD_SIZE", -1))

        if self.wandb.offline:
            command = f"cd {self.run_dir}; wandb sync --sync-all"
            logging.info(f"to sync wandb offline, run: {command}")

        assert self.num_microbatches >= 1
        assert self.num_ckpt_keep is None or self.num_ckpt_keep >= 1
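
        # The existence check below is advisory: its result is discarded, since
        # `model_id_or_path` may be a model id rather than a local path.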
        if self.model_id_or_path is not None:
            Path(self.model_id_or_path).exists()

        if not self.ckpt_only_lora:
            logging.warning(
                "You have disabled `ckpt_only_lora` and are thus merging the trained LoRA checkpoint into the base model upon checkpointing. This might lead to OOM errors - make sure you have enough CPU and GPU memory."
            )
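
# Usage sketch (illustrative, not part of the training code): a config like this
# is typically loaded from a YAML file via simple_parsing's `Serializable`, e.g.
#
#   args = TrainArgs.load("example.yaml", drop_extra_fields=False)  # "example.yaml" is hypothetical
#   print(args.optim.lr, args.seq_len)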