# CHEMISTral7Bv0.3 / finetune /
# Clemspace's picture
# Initial model upload
import json
import logging
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Union
import safetensors.torch
import torch
from mistral_common.tokens.tokenizers.sentencepiece import InstructTokenizerBase
from torch.distributed import barrier
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
from model.transformer import LoRALinear
from .distributed import get_rank, get_world_size
from .utils import TrainState
# Module-level logger, registered under the fixed name "checkpointing".
logger = logging.getLogger(name="checkpointing")
def main_logger_info(message: str) -> None:
    """Log ``message`` at INFO level on the rank-0 process only.

    Args:
        message: Text to emit through the module-level ``logger``.
    """
    # Every other rank stays silent so multi-process runs do not log duplicates.
    if get_rank() == 0:
        logger.info(message)
class Checkpointer:
    """A class to save PyTorch model and optimizer states.

    Checkpoints for step ``N`` are written below
    ``<run_dir>/checkpoints/checkpoint_<N:06d>/``: files are first dumped into
    a ``tmp.consolidated`` directory which is then renamed to ``consolidated``.

    NOTE(review): this class was rebuilt from a copy that had lost its
    indentation, decorators and a number of statements. Lines tagged
    ``NOTE(review)`` below are reconstructions; confirm them against the
    upstream file.
    """

    def __init__(
        self,
        model: FullyShardedDataParallel,
        state: TrainState,
        run_dir: Union[Path, str],
        optimizer: Optional[torch.optim.Optimizer] = None,
        num_ckpt_keep: Optional[int] = None,
    ):
        """Store the objects needed to write checkpoints.

        Args:
            model: The (FSDP-wrapped) model whose weights are saved.
            state: Training state; ``state.step`` names the checkpoint dir.
            run_dir: Root directory of the run; converted to ``Path``.
            optimizer: Stored on the instance; not read by any method
                visible in this class.
            num_ckpt_keep: How many checkpoint directories to keep after a
                save. ``None`` disables the cleanup.
        """
        self.model = model
        self.optimizer = optimizer
        self.state = state
        self.run_dir = Path(run_dir)
        self.rank = get_rank()
        self.num_ckpt_keep = num_ckpt_keep

    @property
    def ckpt_dir(self) -> Path:
        """Directory that holds all checkpoints of this run."""
        return self.run_dir / "checkpoints"

    @property
    def dst_dir(self) -> Path:
        """Final directory of the checkpoint for the current ``state.step``."""
        return self.ckpt_dir / f"checkpoint_{self.state.step:06d}" / "consolidated"

    @staticmethod
    def consolidated_path(
        ckpt_dir: Path, use_safetensors: bool, save_only_lora: Optional[bool] = False
    ) -> Path:
        """Return the weights file path inside ``ckpt_dir``.

        The file is named ``<prefix>.<suffix>`` where the prefix is ``lora``
        or ``consolidated`` and the suffix is ``safetensors`` or ``00.pth``.
        """
        suffix = "safetensors" if use_safetensors else "00.pth"
        prefix = "lora" if save_only_lora else "consolidated"

        return ckpt_dir / f"{prefix}.{suffix}"

    @staticmethod
    def _tmp(ckpt_dir: Path) -> Path:
        """Return the sibling of ``ckpt_dir`` whose name is prefixed ``tmp.``."""
        # NOTE(review): the f-string field was empty in the reviewed copy;
        # ``ckpt_dir.name`` is the reconstruction.
        return ckpt_dir.with_name(f"tmp.{ckpt_dir.name}")

    def write_params_info(self, tmp_dst: Path):
        """Write ``self.model.args.to_dict()`` as JSON to ``params.json``."""
        params_path = tmp_dst / "params.json"
        with open(params_path, "w") as f:
            model_args = self.model.args.to_dict()

            f.write(json.dumps(model_args, indent=4))

    def delete_old_ckpts(self) -> List[Path]:
        """Delete all but the ``num_ckpt_keep`` most recent checkpoint dirs.

        Returns:
            The directories selected for deletion. Removal errors are logged
            and not raised, so a returned path may still exist on disk.
        """
        if self.num_ckpt_keep is None:
            # ``lst[None:]`` is the whole list: without this guard every
            # checkpoint would be selected for deletion.
            return []

        all_saved_ckpts = [d for d in self.ckpt_dir.iterdir() if d.is_dir()]

        # Sort directories by ``st_ctime``, newest first, so that the slice
        # below keeps the most recent ``num_ckpt_keep`` entries.
        all_saved_ckpts.sort(key=lambda x: x.stat().st_ctime, reverse=True)

        ckpts_to_delete = all_saved_ckpts[self.num_ckpt_keep :]

        for ckpt_to_delete in ckpts_to_delete:
            try:
                # NOTE(review): the delete call was missing in the reviewed
                # copy; ``shutil.rmtree`` is the reconstruction.
                shutil.rmtree(ckpt_to_delete)
            except OSError as e:
                main_logger_info(f"Error deleting directory {ckpt_to_delete}: {e}")
            else:
                main_logger_info(f"Deleted ckpt: {ckpt_to_delete}")

        return ckpts_to_delete

    @staticmethod
    def get_lora_states(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Return the entries of ``state_dict`` whose key contains ``"lora"``."""
        return {k: v for k, v in state_dict.items() if "lora" in k}

    @staticmethod
    def get_non_lora_states(
        state_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
        """Return the entries whose key contains neither ``"lora"`` nor ``"frozen"``."""
        return {
            k: v
            for k, v in state_dict.items()
            if not any(l_key in k for l_key in ["lora", "frozen"])
        }

    # NOTE(review): decorator not visible in the reviewed copy - confirm.
    @torch.no_grad()
    def retrieve_save_states(
        self, save_only_lora: bool, save_dtype: torch.dtype
    ) -> Dict[str, torch.Tensor]:
        """Collect the tensors to be written to the checkpoint.

        Args:
            save_only_lora: If true, gather only the state of the lowest-level
                FSDP modules whose parameters all require grad. Otherwise
                gather the full state dict with LoRA weights merged, dropping
                keys that contain ``"lora"`` or ``"frozen"``.
            save_dtype: dtype the returned tensors are cast to.

        Returns:
            A dict of tensors sorted by key.
        """
        if save_only_lora:
            # NOTE(review): the asserted condition was missing in the reviewed
            # copy; ``self.model.args.lora.enable`` is a reconstruction.
            assert (
                self.model.args.lora.enable
            ), "Cannot save LoRA checkpoint as LoRA training is not enabled."

        # remove all potential hooks
        for module in self.model.modules():
            if isinstance(module, LoRALinear) and hasattr(module, "_merge_lora_handle"):
                module._merge_lora_handle.remove()  # type: ignore

        # merge weights if we don't just save LoRA
        if not save_only_lora:

            def merge_lora(
                m: torch.nn.Module,
                destination: Dict[str, torch.Tensor],
                prefix: str,
                *args,
            ):
                # State-dict hooks receive extra positional arguments
                # (absorbed by ``*args``); store the merged weight under the
                # module's plain ``weight`` key.
                weight = m.merge_weight()  # type: ignore
                destination[prefix + "weight"] = weight

            for module in self.model.modules():
                if isinstance(module, LoRALinear):
                    module._merge_lora_handle = module._register_state_dict_hook(
                        merge_lora
                    )

        offload_to_cpu = get_world_size() > 1
        if save_only_lora:

            def is_trainable_fsdp(
                module: Union[torch.nn.Module, FullyShardedDataParallel]
            ):
                is_fsdp = isinstance(module, FullyShardedDataParallel)
                all_params_have_grads = is_fsdp and all(
                    p.requires_grad is True for p in module.parameters()
                )

                # need to make sure only lowest fsdp wrap is used
                is_leaf_node = is_fsdp and len(list(module.module.children())) == 0  # type: ignore

                return is_fsdp and all_params_have_grads and is_leaf_node

            # extract all modules with only trainable weights
            modules = {
                k: m for k, m in self.model.named_modules() if is_trainable_fsdp(m)
            }

            states = {}
            for key, module in modules.items():
                assert isinstance(
                    module, FullyShardedDataParallel
                ), "`module` should be an instance of `FullyShardedDataParallel`"
                # Strip the wrapper-inserted name segments from the module path.
                parent_prefix = key.replace("_fsdp_wrapped_module.", "").replace(
                    "_checkpoint_wrapped_module.", ""
                )
                with module.summon_full_params(
                    module, writeback=True, offload_to_cpu=offload_to_cpu
                ):
                    # NOTE(review): key/value expressions reconstructed;
                    # only the ``for`` clause survived in the reviewed copy.
                    states.update(
                        {
                            f"{parent_prefix}.{k}": v.to(dtype=save_dtype)
                            for k, v in module.state_dict().items()
                        }
                    )
        else:
            # make sure you have enough CPU RAM available to save the full model
            assert isinstance(
                self.model, FullyShardedDataParallel
            ), "`self.model` should be an instance of `FullyShardedDataParallel`"
            with self.model.summon_full_params(
                self.model, writeback=True, offload_to_cpu=offload_to_cpu
            ):
                states = self.get_non_lora_states(self.model.state_dict())
                # NOTE(review): value expression reconstructed (was empty).
                states = {k: v.to(dtype=save_dtype) for k, v in states.items()}

        states = dict(sorted(states.items()))
        return states

    @staticmethod
    def save_tokenizer(instruct_tokenizer: InstructTokenizerBase, tmp_dst: Path):
        """Write the tokenizer's serialized model proto to ``tokenizer.model.v3``."""
        serialized_spm = instruct_tokenizer.tokenizer._model.serialized_model_proto()  # type: ignore

        tokenizer_path = tmp_dst / "tokenizer.model.v3"

        with open(tokenizer_path, "wb") as f:
            f.write(serialized_spm)

    # NOTE(review): decorator not visible in the reviewed copy - confirm.
    @torch.no_grad()
    def save_checkpoint(
        self,
        save_only_lora: bool,
        dtype: torch.dtype = torch.float16,
        instruct_tokenizer: Optional[InstructTokenizerBase] = None,
    ):
        """Dump a checkpoint for the current step into ``self.dst_dir``.

        All ranks take part in gathering the states; only rank 0 writes files.

        Args:
            save_only_lora: Forwarded to ``retrieve_save_states`` and used to
                pick the weights file name.
            dtype: dtype the saved tensors are cast to.
            instruct_tokenizer: If given, its tokenizer model is saved next to
                the weights.

        Raises:
            AssertionError: If ``self.dst_dir`` already exists.
        """
        tmp_dst = self._tmp(self.dst_dir)
        main_logger_info(
            f"Dumping checkpoint in {self.dst_dir} using tmp name: {tmp_dst.name}"
        )

        assert not self.dst_dir.exists(), f"dst exists {self.dst_dir}"
        tmp_dst.mkdir(parents=True, exist_ok=True)

        states: Dict[str, torch.Tensor] = self.retrieve_save_states(
            save_only_lora, dtype
        )

        # NOTE(review): reconstructed - wait for every rank before rank 0 writes.
        barrier()

        if self.rank == 0:
            # save checkpoint in tmp path
            safetensors.torch.save_file(
                states,
                self.consolidated_path(
                    tmp_dst, use_safetensors=True, save_only_lora=save_only_lora
                ),  # always use safetensors for checkpointing
            )

            # NOTE(review): reconstructed call.
            self.write_params_info(tmp_dst)

            # save tokenizer
            if instruct_tokenizer is not None:
                self.save_tokenizer(instruct_tokenizer, tmp_dst)

            assert not self.dst_dir.exists(), f"should not happen! {self.dst_dir}"
            # NOTE(review): reconstructed - publish the finished tmp directory
            # under its final name.
            tmp_dst.rename(self.dst_dir)

            logger.info(
                f"Done dumping checkpoint in {self.dst_dir} for step: {self.state.step}"
            )

            # delete last n checkpoints
            if self.num_ckpt_keep is not None:
                ckpts_to_delete = self.delete_old_ckpts()
                logger.info(
                    f"Done deleting checkpoints {', '.join([str(c) for c in ckpts_to_delete])}"
                )