# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from unsloth_zoo.utils import Version from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit from peft.tuners.lora import Linear4bit as Peft_Linear4bit from peft.tuners.lora import Linear as Peft_Linear from typing import Optional, Callable, Union, List import torch import os import shutil import pickle import gc from transformers.models.llama.modeling_llama import logger from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias import subprocess import psutil import re from transformers.models.llama.modeling_llama import logger from .tokenizer_utils import fix_sentencepiece_gguf from huggingface_hub import HfApi try: from huggingface_hub.utils import get_token except: # Old HF Hub versions <= 0.0.25 from huggingface_hub.utils._token import get_token pass from pathlib import Path __all__ = [ "print_quantization_methods", "unsloth_save_model", "save_to_gguf", "patch_saving_functions", "create_huggingface_repo", ] # llama.cpp specific targets - all takes 90s. Below takes 60s LLAMA_CPP_TARGETS = ["llama-quantize", "llama-export-lora", "llama-cli",] # Check environments keynames = "\n" + "\n".join(os.environ.keys()) IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames KAGGLE_TMP = "/tmp" del keynames # Weights LLAMA_WEIGHTS = ( "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj", "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", ) LLAMA_LAYERNORMS = ( "input_layernorm", "post_attention_layernorm", "pre_feedforward_layernorm", "post_feedforward_layernorm", ) # https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19 # From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html ALLOWED_QUANTS = \ { "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.", "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.", "quantized" : "Recommended. Slow conversion. Fast inference, small files.", "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.", "bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", "f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.", "q8_0" : "Fast conversion. High resource use, but generally acceptable.", "q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K", "q5_k_m" : "Recommended. 
Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K", "q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.", "q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K", "q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K", "q3_k_s" : "Uses Q3_K for all tensors", "q4_0" : "Original quant method, 4-bit.", "q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.", "q4_k_s" : "Uses Q4_K for all tensors", "q4_k" : "alias for q4_k_m", "q5_k" : "alias for q5_k_m", "q5_0" : "Higher accuracy, higher resource usage and slower inference.", "q5_1" : "Even higher accuracy, resource usage and slower inference.", "q5_k_s" : "Uses Q5_K for all tensors", "q6_k" : "Uses Q8_K for all tensors", # "iq2_xxs" : "2.06 bpw quantization", # Not supported sadly # "iq2_xs" : "2.31 bpw quantization", # "iq3_xxs" : "3.06 bpw quantization", "q3_k_xs" : "3-bit extra small quantization", } def print_quantization_methods(): for key, value in ALLOWED_QUANTS.items(): print(f'"{key}" ==> {value}') pass pass def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"): if not hasattr(model, "_saved_temp_tokenizer"): return False temp_tokenizer = model._saved_temp_tokenizer sentencepiece_model = False file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path) created_folder = False if not os.path.exists(file_location): created_folder = True os.makedirs(file_location) pass temp_tokenizer.save_pretrained(file_location) if os.path.isfile(f"{file_location}/tokenizer.model"): sentencepiece_model = True pass if created_folder: shutil.rmtree(file_location, ignore_errors = True) return sentencepiece_model pass def _free_cached_model(model): from huggingface_hub import scan_cache_dir cached_repos = list(scan_cache_dir().repos) # Go through every cached repo, and delete the one that matches the model we want to save. # Can save 4GB of disk space - useful for Kaggle systems. for cached_repo in cached_repos: if cached_repo.repo_id == model.config._name_or_path: remove_cache_commit = list(cached_repo.revisions)[0].commit_hash delete_strategy = scan_cache_dir().delete_revisions(remove_cache_commit,) logger.warning_once( "Unsloth: Will remove a cached repo with size " + \ delete_strategy.expected_freed_size_str, ) delete_strategy.execute() pass pass pass def _merge_lora(layer, name): bias = getattr(layer, "bias", None) if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)): # Is LoRA so we need to merge! 
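        # A short note on the merge below (a sketch of the standard LoRA identity, not part
        # of the original control flow): the 4-bit base weight W is first dequantized, then
        # the adapter is folded in as
        #     W_merged = W + s * (B @ A)
        # where A is lora_A.weight with shape (r, in_features), B is lora_B.weight with
        # shape (out_features, r), and s is the LoRA scaling (typically lora_alpha / r).
        # The addmm_ call below applies the same update to W.t() in float32, i.e.
        #     W.t() += s * (A.t() @ B.t())
        # and the result is transposed back and cast to the original dtype.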
W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer) if quant_state is not None: dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2] W = fast_dequantize(W, quant_state) else: dtype = W.dtype W = W.to(torch.float32).t() # W = W.t() if A is not None: # sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32))) # W += sAB W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s) # W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s) # if not torch.isfinite(W).all(): maximum_element = torch.max(W.min().abs(), W.max()) if not torch.isfinite(maximum_element).item(): raise ValueError(f"Unsloth: Merge failed.\n{name} has some elements = infinity.") pass W = W.t().to(dtype) else: W = layer.weight return W, bias pass def fast_save_pickle(shard, name): # Use this if # CPUs is <= 2 print(f"Unsloth: Saving {name}...") torch.save( shard, name, # HIGHEST_PROTOCOL seems to not work with Pytorch! # pickle_module = pickle, # pickle_protocol = pickle.HIGHEST_PROTOCOL, ) return pass @torch.inference_mode def unsloth_save_model( model, tokenizer, save_directory : Union[str, os.PathLike], save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"] push_to_hub : bool = False, token : Optional[Union[str, bool]] = None, is_main_process : bool = True, state_dict : Optional[dict] = None, save_function : Callable = torch.save, max_shard_size : Union[int, str] = "5GB", safe_serialization : bool = True, variant : Optional[str] = None, save_peft_format : bool = True, # Push to hub use_temp_dir : Optional[bool] = None, commit_message : Optional[str] = "Trained with Unsloth", private : Optional[bool] = None, create_pr : bool = False, revision : str = None, commit_description : str = "Upload model trained with Unsloth 2x faster", tags : List[str] = None, # Our functions temporary_location : str = "_unsloth_temporary_saved_buffers", maximum_memory_usage : float = 0.9, ): if token is None: token = get_token() if commit_message is None: commit_message = "" if "Unsloth" not in commit_message: commit_message += " (Trained with Unsloth)" commit_message = commit_message.lstrip() if commit_description is None: commit_description = "Upload model trained with Unsloth 2x faster" elif "Unsloth 2x faster" not in commit_description: commit_description += " (Trained with Unsloth 2x faster)" pass if save_method == "merged_4bit": raise RuntimeError( "Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan\n"\ "to merge to GGUF or others later on. I suggest you to do this as a final step\n"\ "if you're planning to do multiple saves.\n"\ "If you are certain, change `save_method` to `merged_4bit_forced`." ) elif save_method == "merged_4bit_forced": save_method = "merged_4bit" pass save_pretrained_settings = dict(locals()) for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"): del save_pretrained_settings[deletion] pass # First check for a token! 
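    # Note: `token` defaults to huggingface_hub's get_token(), which picks up either the
    # HF_TOKEN environment variable or a cached `huggingface-cli login` credential.
    # whoami() is called here purely to validate that token so a bad or missing token
    # fails fast, before any expensive merging or saving work begins.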
if push_to_hub: from huggingface_hub import whoami try: username = whoami(token = token)["name"] except: raise RuntimeError( "Unsloth: Please supply a token!\n"\ "Go to https://huggingface.co/settings/tokens" ) pass pass assert(maximum_memory_usage > 0 and maximum_memory_usage <= 0.95) # Clean memory up first for _ in range(3): torch.cuda.empty_cache() gc.collect() pass save_method = save_method.lower().replace(" ", "_") if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit": raise RuntimeError( "Unsloth: You must select one of 3 options when saving models:\n"\ '"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\ '"merged_16bit" ==> This merges LoRA weights and saves to float16. Needed for llama.cpp / GGUF.\n'\ '"merged_4bit" ==> This merges LoRA weights and saves to 4bit. Useful for DPO / inference.' ) pass if save_method == "merged_4bit": print("Unsloth: Merging 4bit and LoRA weights to 4bit...") print("This might take 5 minutes...") # Counteract no LoRA adapters! if hasattr(model, "merge_and_unload"): model = model.merge_and_unload() pass print("Done.") pass if tags is not None: assert(isinstance(tags, (list, tuple))) tags = list(tags) + ["unsloth",] else: tags = ["unsloth",] pass save_pretrained_settings["tags"] = tags if ((save_method == "lora") or (save_method == "merged_4bit")) and push_to_hub: if token is None: raise RuntimeError( "Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\ "Go to https://huggingface.co/settings/tokens." ) pass if save_method == "lora": print("Unsloth: Saving LoRA adapters. Please wait...") elif save_method == "merged_4bit": print("Unsloth: Saving 4bit Bitsandbytes model. Please wait...") pass # Update model tag _ = upload_to_huggingface( model, save_directory, token, "finetuned", "trl", file_location = None, old_username = None, private = private, ) getattr(model, "original_push_to_hub", tokenizer.push_to_hub)\ ( repo_id = save_directory, use_temp_dir = use_temp_dir, commit_message = commit_message, private = private, token = token, max_shard_size = max_shard_size, create_pr = create_pr, safe_serialization = safe_serialization, revision = revision, commit_description = commit_description, tags = tags, ) if tokenizer is not None: # Set padding side to left for inference old_padding_side = tokenizer.padding_side tokenizer.padding_side = "left" getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)\ ( repo_id = save_directory, use_temp_dir = use_temp_dir, commit_message = commit_message, private = private, token = token, max_shard_size = max_shard_size, create_pr = create_pr, safe_serialization = safe_serialization, revision = revision, commit_description = commit_description, tags = tags, ) # Revert back padding side tokenizer.padding_side = old_padding_side pass if hasattr(model, "config"): print(f"Saved {save_method} model to https://huggingface.co/" + save_directory) pass return save_directory, None pass # Tokenizer has different saving arguments tokenizer_save_settings = \ { "save_directory" : save_pretrained_settings["save_directory"], "legacy_format" : None, "filename_prefix" : None, "push_to_hub" : save_pretrained_settings["push_to_hub"], "private" : save_pretrained_settings["private"], "token" : save_pretrained_settings["token"], } # Check if PEFT Model or not - if yes, 3 levels. If not 2 levels. 
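    # In PEFT terms: a PeftModelForCausalLM wraps the HF causal-LM, so the decoder stack
    # sits at model.model.model.layers (3 levels deep), whereas a plain HF model exposes it
    # at model.model.layers (2 levels). `internal_model` is normalised to the inner HF model
    # so the rest of this function can address the weights the same way in both cases.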
from peft import PeftModelForCausalLM if isinstance(model, PeftModelForCausalLM): internal_model = model.model else: internal_model = model pass # Cannot be converted properly! if (save_method == "merged_4bit") or (save_method == "lora") or ( not hasattr(model, "model") or \ not hasattr(internal_model.model, "layers") ): # Do general saving # Edit save_pretrained_settings # [TODO] _create_repo has errors due to **kwargs getting accepted # commit_description does not seem to work? what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \ if save_pretrained_settings["push_to_hub"] is False else \ ("use_temp_dir", "create_pr", "revision", "tags", "commit_description",) for deletion in what_to_delete: del save_pretrained_settings[deletion] pass if hasattr(model, "add_model_tags"): model.add_model_tags(["unsloth",]) # Update model tag if push_to_hub: _ = upload_to_huggingface( model, save_pretrained_settings["save_directory"], token, "finetuned", "trl", file_location = None, old_username = None, private = private, ) pass if tokenizer is not None: print("Unsloth: Saving tokenizer...", end = "") # Set padding side to left for inference old_padding_side = tokenizer.padding_side tokenizer.padding_side = "left" tokenizer.save_pretrained(**tokenizer_save_settings) # Revert back padding side tokenizer.padding_side = old_padding_side print(" Done.") else: print() print("Unsloth: Saving model...", end = "") if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "") # [TODO] Is this correct? if save_method == "lora": save_pretrained_settings["selected_adapters"] = None pass model.save_pretrained(**save_pretrained_settings) if push_to_hub and hasattr(model, "config"): print("Saved to https://huggingface.co/" + save_pretrained_settings["save_directory"]) pass print(" Done.") return save_directory, None pass # If push_to_hub, we must remove the .../ part of a repo username = None if push_to_hub and "/" in save_directory: # +1 solves absolute path issues new_save_directory = save_directory username = new_save_directory[:new_save_directory.find("/")] new_save_directory = new_save_directory[new_save_directory.find("/")+1:] if IS_KAGGLE_ENVIRONMENT: new_save_directory = os.path.join(KAGGLE_TMP, new_save_directory[new_save_directory.find("/")+1:]) logger.warning_once( "Unsloth: You are pushing to hub in Kaggle environment.\n"\ f"To save memory, we shall move {save_directory} to {new_save_directory}" ) else: logger.warning_once( f"Unsloth: You are pushing to hub, but you passed your HF username = {username}.\n"\ f"We shall truncate {save_directory} to {new_save_directory}" ) save_pretrained_settings["save_directory"] = new_save_directory tokenizer_save_settings ["save_directory"] = new_save_directory save_directory = new_save_directory pass print("Unsloth: Merging 4bit and LoRA weights to 16bit...") # Determine max RAM usage minus sharding max_ram = psutil.virtual_memory().available sharded_ram_usage = 5 * 1024 * 1024 * 1024 if type(max_shard_size) is str: gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE) mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE) if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024 elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024 elif type(max_shard_size) is int: sharded_ram_usage = sharded_ram_usage pass # Switch to our fast saving modules if it's a slow PC! 
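    # On machines with <= 2 physical cores, safetensors serialization is far slower than a
    # plain torch.save, so the block below swaps in fast_save_pickle unless the caller
    # explicitly passes safe_serialization = None to keep safetensors.
    # For reference (illustrative values, not from the source): max_shard_size = "5GB" is
    # parsed by the regex above into 5 * 1024**3 bytes, which is then reserved as sharding
    # overhead when estimating how much RAM is available for the merge.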
n_cpus = psutil.cpu_count(logical = False) if n_cpus is None: n_cpus = psutil.cpu_count() if n_cpus is None: n_cpus = 1 if safe_serialization is None: safe_serialization = True save_pretrained_settings["safe_serialization"] = safe_serialization elif safe_serialization and (n_cpus <= 2): logger.warning_once( f"Unsloth: You have {n_cpus} CPUs. Using `safe_serialization` is 10x slower.\n"\ f"We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.\n"\ f"To force `safe_serialization`, set it to `None` instead.", ) safe_serialization = False save_function = fast_save_pickle save_pretrained_settings["safe_serialization"] = safe_serialization save_pretrained_settings["save_function"] = save_function pass # Only safe_serialization uses more RAM if safe_serialization: max_ram -= sharded_ram_usage else: max_ram -= sharded_ram_usage*0.25 # Uses much less pass max_ram = int(max(0, max_ram) * maximum_memory_usage) print(f"Unsloth: Will use up to "\ f"{round(max_ram/1024/1024/1024, 2)} out of "\ f"{round(psutil.virtual_memory().total/1024/1024/1024, 2)} RAM for saving.") # Move temporary_location to /tmp in Kaggle if IS_KAGGLE_ENVIRONMENT: temporary_location = os.path.join(KAGGLE_TMP, temporary_location) # Max directory for disk saving if not os.path.exists(temporary_location): os.makedirs(temporary_location) pass # Check if Kaggle or Colab, since only 20GB of Disk space allowed. if IS_KAGGLE_ENVIRONMENT or IS_COLAB_ENVIRONMENT: # We free up 4GB of space logger.warning_once( "Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n"\ "model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab." ) _free_cached_model(internal_model) pass # HF also uses a OrderedDict from collections import OrderedDict state_dict = OrderedDict() torch_dtype = internal_model.config.torch_dtype if type(torch_dtype) is str: if torch_dtype == "float16": torch_dtype = torch.float16 elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16 pass # Check modules to save float32 dtype state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype) max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage) print("Unsloth: Saving model... This might take 5 minutes ...") from tqdm import tqdm as ProgressBar for j, layer in enumerate(ProgressBar(internal_model.model.layers)): for item in LLAMA_WEIGHTS: proj = eval(f"layer.{item}") name = f"model.layers.{j}.{item}.weight" W, bias = _merge_lora(proj, name) # Bias term if bias is not None: state_dict[f"model.layers.{j}.{item}.bias"] = bias pass if (torch.cuda.memory_allocated() + W.nbytes) < max_vram: # Save to GPU memory state_dict[name] = W # [TODO] Saving to RAM seems to leak memory??? # elif (max_ram - W.nbytes) > 0: # # Save to CPU memory # logger.warning_once(f"We will save to RAM and not VRAM now.") # state_dict[name] = W.to("cpu", non_blocking = True, copy = True) # max_ram = max(max_ram - W.nbytes, 0) else: # Save to Disk logger.warning_once("\nWe will save to Disk and not RAM now.") filename = os.path.join(temporary_location, f"{name}.pt") torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,) # weights_only = True weirdly fails? 
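            # The reload below uses torch.load(..., mmap = True) so the merged tensor is
            # memory-mapped straight from the temporary file on disk rather than being
            # materialised in RAM/VRAM up front; weights_only = True is deliberately not
            # used (per the note above, it fails here).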
state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True, weights_only = False) pass for item in LLAMA_LAYERNORMS: try: # Skip for Gemma 2 state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data") except: continue pass pass state_dict["model.norm.weight"] = internal_model.model.norm.weight.data # Check for modules_to_save float32 dtype # Check for tied weights if internal_model.model.embed_tokens.weight.data_ptr() != internal_model.lm_head.weight.data_ptr(): state_dict["lm_head.weight"] = internal_model.lm_head.weight.data.to(torch_dtype) pass # All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter for key, value in state_dict.items(): if hasattr(value, "data"): state_dict[key] = value = value.data if type(value) is not torch.Tensor: logger.warning_once(f"Unsloth: {key} is not a Tensor but a {type(value)}.") pass pass # Edit save_pretrained_settings # [TODO] _create_repo has errors due to **kwargs getting accepted save_pretrained_settings["state_dict"] = state_dict # commit_description does not seem to work? what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \ if not push_to_hub else \ ("use_temp_dir", "create_pr", "revision", "tags", "commit_description",) for deletion in what_to_delete: del save_pretrained_settings[deletion] pass if hasattr(model, "add_model_tags"): model.add_model_tags(["unsloth",]) # Update model tag if push_to_hub: _ = upload_to_huggingface( model, save_pretrained_settings["save_directory"], token, "finetuned", "trl", file_location = None, old_username = username, private = private, ) pass # First check if we're pushing to an organization! save_directory = save_pretrained_settings["save_directory"] if save_pretrained_settings["push_to_hub"]: new_save_directory, new_username = _determine_username(save_directory, username, token) if token is not None: from huggingface_hub import whoami actual_username = whoami(token = token)["name"] else: actual_username = username pass # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") # We upload everything at the end! tokenizer_save_settings["push_to_hub"] = False tokenizer_save_settings["save_directory"] = new_save_directory pass # Save tokenizer if tokenizer is not None: print("Unsloth: Saving tokenizer...", end = "") # Set padding side to left for inference old_padding_side = tokenizer.padding_side tokenizer.padding_side = "left" tokenizer.save_pretrained(**tokenizer_save_settings) # Revert back padding side tokenizer.padding_side = old_padding_side print(" Done.") else: print() pass # Since merged, edit quantization_config old_config = model.config new_config = model.config.to_dict() if "quantization_config" in new_config: del new_config["quantization_config"] original_model = model new_config = type(model.config).from_dict(new_config) while hasattr(original_model, "model"): original_model = original_model.model original_model.config = new_config model.config = new_config # Save! # [TODO] --> is this correct? # save_pretrained_settings["selected_adapters"] = None # Check if pushing to an organization if save_pretrained_settings["push_to_hub"] and (username != actual_username): print(f"Unsloth: Saving to organization with address {new_save_directory}") # Pushing to organization! 
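            # Flow for organisation repos: save_pretrained writes the shards locally with
            # push_to_hub disabled, then HfApi.upload_folder pushes the whole folder to the
            # `org/repo` id in a single commit (markdown files are skipped via ignore_patterns).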
# Sadly .save_pretrained doesn't work :( # We first save it via .save_pretrained, then upload manually! save_pretrained_settings["save_directory"] = new_save_directory save_pretrained_settings["push_to_hub"] = False internal_model.save_pretrained(**save_pretrained_settings) # Now manually go through each file and upload them manually! filenames = os.listdir(new_save_directory) hf_api = HfApi(token = save_pretrained_settings["token"]) print("Unsloth: Uploading all files... Please wait...") hf_api.upload_folder( folder_path = new_save_directory, path_in_repo = ".", repo_id = new_save_directory, repo_type = "model", commit_message = "(Trained with Unsloth)", ignore_patterns = "*.md", ) else: internal_model.save_pretrained(**save_pretrained_settings) pass # Revert config back original_model = model while hasattr(original_model, "model"): original_model = original_model.model original_model.config = old_config model.config = old_config print("Done.") if push_to_hub and hasattr(model, "config"): print(f"Saved merged model to https://huggingface.co/{username}/{save_directory.lstrip('/').split('/')[-1]}") pass save_pretrained_settings["state_dict"] = None for j, (key, value) in enumerate(state_dict.items()): state_dict[key] = None if j % 10 == 0: torch.cuda.empty_cache() gc.collect() pass pass state_dict = None del state_dict torch.cuda.empty_cache() gc.collect() # Remove temporary location import shutil shutil.rmtree(temporary_location, ignore_errors = True) for _ in range(3): torch.cuda.empty_cache() gc.collect() return save_directory, username pass def install_llama_cpp_clone_non_blocking(): full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"] run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass def install_llama_cpp_make_non_blocking(): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # env = { **os.environ, "LLAMA_CUDA": "1", } # Force make clean check = os.system("make clean -C llama.cpp") IS_CMAKE = False if check == 0: # Uses old MAKE n_jobs = max(int(psutil.cpu_count()*1.5), 1) full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"] IS_CMAKE = False else: # Uses new CMAKE n_jobs = max(int(psutil.cpu_count()), 1) # Use less CPUs since 1.5x faster check = os.system("cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON") if check != 0: raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp using os.system(...) with error {check}. Please report this ASAP!") pass # f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}", full_command = [ "cmake", "--build", "llama.cpp/build", "--config", "Release", "-j"+str(n_jobs), "--clean-first", "--target", ] + LLAMA_CPP_TARGETS IS_CMAKE = True pass # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? 
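    # The build is launched with subprocess.Popen so it proceeds in the background while the
    # model is being merged and saved; save_to_gguf later calls .wait() on the returned handle
    # and falls back to install_llama_cpp_old(...) if the build exits with a non-zero code.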
# run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer, IS_CMAKE pass def install_python_non_blocking(packages = []): full_command = ["pip", "install"] + packages run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass def try_execute(commands, force_complete = False): for command in commands: with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: line = line.decode("utf-8", errors = "replace") if "undefined reference" in line: raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!") elif "deprecated" in line: return "CMAKE" elif "Unknown argument" in line: raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!") elif "***" in line: raise RuntimeError(f"*** Unsloth: Failed compiling llama.cpp with {line}. Please report this ASAP!") print(line, flush = True, end = "") pass if force_complete and sp.returncode is not None and sp.returncode != 0: raise subprocess.CalledProcessError(sp.returncode, sp.args) pass pass return None pass def install_llama_cpp_old(version = -10): # Download the 10th latest release since the latest might be broken! # FALLBACK mechanism releases = subprocess.check_output(["git", "ls-remote", "--tags", "https://github.com/ggerganov/llama.cpp.git"]) releases = releases.decode("utf-8").replace("\t", " ").split("\n") for i, x in enumerate(releases): if "refs/tags/b" not in x: break releases = releases[:i] latest = releases[-1] version = releases[version].split(" ")[0] # Check if the llama.cpp exists if os.path.exists("llama.cpp"): print( "**[WARNING]** You have a llama.cpp directory which is broken.\n"\ "Unsloth will DELETE the broken directory and install a new one.\n"\ "Press CTRL + C / cancel this if this is wrong. We shall wait 30 seconds.\n" ) import time for i in range(30): print(f"**[WARNING]** Deleting llama.cpp directory... {30-i} seconds left.") time.sleep(1) import shutil shutil.rmtree("llama.cpp", ignore_errors = True) pass # Clone a specific commit # Also don't use the GPU! commands = [ "git clone --recursive https://github.com/ggerganov/llama.cpp", f"cd llama.cpp && git reset --hard {version} && git clean -df", ] try_execute(commands) # Try using MAKE commands = [ "make clean -C llama.cpp", f"make all -j{psutil.cpu_count()*2} -C llama.cpp", ] if try_execute(commands) == "CMAKE": # Instead use CMAKE commands = [ "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON", f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}", "cp llama.cpp/build/bin/llama-* llama.cpp", "rm -rf llama.cpp/build", ] try_execute(commands) pass # Check if successful if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"): raise RuntimeError( "Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?" ) pass pass def install_llama_cpp_blocking(use_cuda = False): # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? 
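    # Unlike the *_non_blocking helpers above, this clones and builds llama.cpp synchronously
    # via try_execute. CUDA stays disabled on purpose: the LLAMA_CUDA make flag is left
    # commented out and the CMake fallback passes -DGGML_CUDA=OFF, because GPU-side GGUF
    # conversion has been unreliable (see the issue linked above).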
# use_cuda = "LLAMA_CUDA=1" if use_cuda else "" commands = [ "git clone --recursive https://github.com/ggerganov/llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return try_execute(commands) commands = [ "make clean -C llama.cpp", # https://github.com/ggerganov/llama.cpp/issues/7062 # Weirdly GPU conversion for GGUF breaks?? # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", f"make all -j{psutil.cpu_count()*2} -C llama.cpp", ] if try_execute(commands) == "CMAKE": # Instead use CMAKE commands = [ "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=ON", f"cmake --build llama.cpp/build --config Release -j{psutil.cpu_count()*2} --clean-first --target {' '.join(LLAMA_CPP_TARGETS)}", "cp llama.cpp/build/bin/llama-* llama.cpp", "rm -rf llama.cpp/build", ] try_execute(commands) pass pass def get_executable(executables): # Get system locations (System Path).split(system separator) system_directories = os.environ.get("PATH").split(os.pathsep) for directory in system_directories: for executable in executables: path = os.path.join(directory, executable) # Check if the executable exists and is executable if os.path.exists(path) and os.access(path, os.X_OK): return path pass pass return None pass def save_to_gguf( model_type : str, model_dtype : str, is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"] first_conversion : str = None, _run_installer = None, # Non blocking install of llama.cpp ): # logger.warning( # "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ # "Please be patient - GGUF saving should still work, but might not work as well." # ) assert(model_dtype == "float16" or model_dtype == "bfloat16") model_dtype = "f16" if model_dtype == "float16" else "bf16" # Convert quantization_method to list if isinstance(quantization_method, list): pass elif isinstance(quantization_method, str): quantization_method = [ quantization_method, ] elif isinstance(quantization_method, tuple): quantization_method = list(quantization_method) else: raise TypeError("Unsloth: quantization_method can only be a string or a list of strings") pass # Check if bfloat16 is supported if model_dtype == "bf16" and not torch.cuda.is_bf16_supported(): logger.warning( "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ "We shall switch instead to f16." ) model_dtype = "f16" pass # Check first_conversion as well if first_conversion is None: first_conversion = model_dtype pass # Check I quants for quant_method in quantization_method: if quant_method.startswith("iq2"): raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") pass # Careful convert.py is only for Llama / Mistral based archs use_fast_convert = False if not is_sentencepiece: use_fast_convert = False # Llama-3 elif model_type == "llama": use_fast_convert = True elif model_type == "mistral": use_fast_convert = True pass logger.warning_once(f"Unsloth: Converting {model_type} model. 
Can use fast conversion = {use_fast_convert}.") # Map quant methods new_quantization_method = [] for quant_method in quantization_method: if quant_method == "not_quantized": quant_method = model_dtype elif quant_method == "fast_quantized": quant_method = "q8_0" elif quant_method == "quantized": quant_method = "q4_k_m" elif quant_method is None: quant_method = "q8_0" # Check if wrong method if quant_method not in ALLOWED_QUANTS.keys(): error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n" for key, value in ALLOWED_QUANTS.items(): error += f"[{key}] => {value}\n" raise RuntimeError(error) pass new_quantization_method.append(quant_method) pass quantization_method = new_quantization_method print_info = \ f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\ f" \\\ /| [0] Installing llama.cpp might take 3 minutes.\n"\ f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits might take 3 minutes.\n"\ f"\ / [2] Converting GGUF 16bits to {quantization_method} might take 10 minutes each.\n"\ f' "-____-" In total, you will have to wait at least 16 minutes.\n' print(print_info) # Check first_conversion format if first_conversion == "f16" : pass elif first_conversion == "bf16" : pass elif first_conversion == "f32" : pass elif first_conversion == "q8_0" : pass else: raise RuntimeError( f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`." ) pass # Determine whether the system already has llama.cpp installed and the scripts are executable quantize_location = get_executable(["llama-quantize", "quantize"]) convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"]) error = 0 if quantize_location is not None and convert_location is not None: print("Unsloth: llama.cpp found in the system. We shall skip installation.") else: print("Unsloth: Installing llama.cpp. This might take 3 minutes...") if _run_installer is not None: _run_installer, IS_CMAKE = _run_installer error = _run_installer.wait() # Check if successful if error != 0: print(f"Unsloth: llama.cpp error code = {error}.") install_llama_cpp_old(-10) pass if IS_CMAKE: # CMAKE needs to do some extra steps print("Unsloth: CMAKE detected. Finalizing some steps for installation.") check = os.system("cp llama.cpp/build/bin/llama-* llama.cpp") if check != 0: raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") check = os.system("rm -rf llama.cpp/build") if check != 0: raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!") pass else: error = 0 install_llama_cpp_blocking() pass # Careful llama.cpp/quantize changed to llama.cpp/llama-quantize # and llama.cpp/main changed to llama.cpp/llama-cli # See https://github.com/ggerganov/llama.cpp/pull/7809 quantize_location = None if os.path.exists("llama.cpp/quantize"): quantize_location = "llama.cpp/quantize" elif os.path.exists("llama.cpp/llama-quantize"): quantize_location = "llama.cpp/llama-quantize" else: raise RuntimeError( "Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?" ) pass # See https://github.com/unslothai/unsloth/pull/730 # Filenames changed again! 
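    # Same situation for the conversion script: newer llama.cpp ships convert_hf_to_gguf.py
    # (underscores) while older checkouts used convert-hf-to-gguf.py, so both spellings are
    # probed below, mirroring the quantize -> llama-quantize rename handled above.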
convert_location = None if os.path.exists("llama.cpp/convert-hf-to-gguf.py"): convert_location = "llama.cpp/convert-hf-to-gguf.py" elif os.path.exists("llama.cpp/convert_hf_to_gguf.py"): convert_location = "llama.cpp/convert_hf_to_gguf.py" else: raise RuntimeError( "Unsloth: The file 'llama.cpp/convert-hf-to-gguf.py' or 'llama.cpp/convert_hf_to_gguf.py' does not exist.\n"\ "But we expect this file to exist! Maybe the llama.cpp developers changed the name?" ) pass pass # Determine maximum first_conversion state if first_conversion == "f32" : strength = 3 elif first_conversion == "f16" : strength = 2 elif first_conversion == "bf16" : strength = 1 elif first_conversion == "q8_0" : strength = 0 for quant_method in quantization_method: if quant_method == "f32": strength = max(strength, 3) elif quant_method == "f16": strength = max(strength, 2) elif quant_method == "bf16": strength = max(strength, 1) elif quant_method == "q8_0": strength = max(strength, 0) else: # Quantized models must have f16 as the default argument if first_conversion == "f32" : pass elif first_conversion == "f16" : pass elif first_conversion == "bf16" : pass elif first_conversion == "q8_0": logger.warning_once( "Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\ "but saves disk space!" ) # first_conversion = "f16" pass pass pass # If only q8_0: if len(quantization_method) == 1 and quantization_method[0] == "q8_0": strength = 0 pass if strength >= 3: first_conversion = "f32" elif strength >= 2: first_conversion = "f16" elif strength >= 1: first_conversion = "bf16" else: first_conversion = "q8_0" # Non llama/mistral needs can only use f32 or f16 if not use_fast_convert and \ (first_conversion != "f16" or first_conversion != "bf16" or first_conversion != "f32"): pass # Latest llama.cpp works for all models for q8_0! # logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.") # first_conversion = "f16" pass # Check if bfloat16 is supported if first_conversion == "bf16" and not torch.cuda.is_bf16_supported(): logger.warning( "Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\ "We shall switch instead to f16." ) first_conversion = "f16" pass n_cpus = psutil.cpu_count() if n_cpus is None: n_cpus = 1 n_cpus *= 2 # Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model final_location = str((Path(model_directory) / f"unsloth.{first_conversion.upper()}.gguf").absolute()) print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\ f"The output location will be {final_location}\n"\ "This might take 3 minutes...") # We first check if tokenizer.model exists in the model_directory if os.path.exists(f"{model_directory}/tokenizer.model"): vocab_type = "spm,hfft,bpe" # Fix Sentencepiece model as well! fix_sentencepiece_gguf(model_directory) else: vocab_type = "bpe" pass # convert.py is deprecated! use_fast_convert = False if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: command = f"python {convert_location} {model_directory} "\ f"--outfile {final_location} "\ f"--outtype {first_conversion}" pass try_execute([command,], force_complete = True) # Check if quantization succeeded! 
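    # try_execute streams the converter's output and raises on compile/usage errors, so the
    # remaining failure signal is the output file itself: if the expected .gguf is missing,
    # the checks below distinguish a Kaggle disk-space problem (only ~20GB in the working
    # directory, hence the /tmp suggestion) from a broken llama.cpp build.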
if not os.path.isfile(final_location): if IS_KAGGLE_ENVIRONMENT: if not Path(final_location).resolve().is_relative_to(Path('/tmp').resolve()): raise RuntimeError( f"Unsloth: Quantization failed for {final_location}\n"\ "You are in a Kaggle environment, which might be the reason this is failing.\n"\ "Kaggle only provides 20GB of disk space in the working directory.\n"\ "Merging to 16bit for 7b models use 16GB of space.\n"\ "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ "You can try saving it to the `/tmp` directory for larger disk space.\n"\ "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." ) else: raise RuntimeError( f"Unsloth: Quantization failed for {final_location}\n"\ "You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass pass print(f"Unsloth: Conversion completed! Output location: {final_location}") full_precision_location = final_location all_saved_locations = [full_precision_location,] # Convert each type! for quant_method in quantization_method: if quant_method != first_conversion: print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This might take 20 minutes...") final_location = str((Path(model_directory) / f"unsloth.{quant_method.upper()}.gguf").absolute()) command = f"./{quantize_location} {full_precision_location} "\ f"{final_location} {quant_method} {n_cpus}" try_execute([command,], force_complete = True) # Check if quantization succeeded! if not os.path.isfile(final_location): if IS_KAGGLE_ENVIRONMENT: if not Path(final_location).resolve().is_relative_to(Path('/tmp').resolve()): raise RuntimeError( f"Unsloth: Quantization failed for {final_location}\n"\ "You are in a Kaggle environment, which might be the reason this is failing.\n"\ "Kaggle only provides 20GB of disk space in the working directory.\n"\ "Merging to 16bit for 7b models use 16GB of space.\n"\ "This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\ "`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\ "You can try saving it to the `/tmp` directory for larger disk space.\n"\ "I suggest you to save the 16bit model first, then use manual llama.cpp conversion." ) else: raise RuntimeError( "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass pass print(f"Unsloth: Conversion completed! 
Output location: {final_location}") all_saved_locations.append(final_location) pass pass # Finally check if first_conversion (f16, bf16 etc) was in the list of actual quant methods full_precision_seen = first_conversion in frozenset(quantization_method) return all_saved_locations, full_precision_seen pass def unsloth_save_pretrained_merged( self, save_directory : Union[str, os.PathLike], tokenizer = None, save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] push_to_hub : bool = False, token : Optional[Union[str, bool]] = None, is_main_process : bool = True, state_dict : Optional[dict] = None, save_function : Callable = torch.save, max_shard_size : Union[int, str] = "5GB", safe_serialization : bool = True, variant : Optional[str] = None, save_peft_format : bool = True, tags : List[str] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", maximum_memory_usage : float = 0.75, ): """ Same as .save_pretrained(...) except 4bit weights are auto converted to float16 with as few overhead as possible. Choose for `save_method` to be either: 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference. 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. """ if tokenizer is None: logger.warning_once( "Unsloth: You're not saving a tokenizer as well?\n"\ "You can do it separately via `tokenizer.save_pretrained(...)`" ) pass arguments = dict(locals()) arguments["model"] = self del arguments["self"] unsloth_save_model(**arguments) for _ in range(3): gc.collect() pass def unsloth_push_to_hub_merged( self, repo_id : str, tokenizer = None, save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] use_temp_dir : Optional[bool] = None, commit_message : Optional[str] = "Trained with Unsloth", private : Optional[bool] = None, token : Union[bool, str, None] = None, max_shard_size : Union[int, str, None] = "5GB", create_pr : bool = False, safe_serialization : bool = True, revision : str = None, commit_description : str = "Upload model trained with Unsloth 2x faster", tags : Optional[List[str]] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", maximum_memory_usage : float = 0.75, ): """ Same as .push_to_hub(...) except 4bit weights are auto converted to float16 with as few overhead as possible. Choose for `save_method` to be either: 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference. 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. """ if tokenizer is None: logger.warning_once( "Unsloth: You're not saving a tokenizer as well?\n"\ "You can do it separately via `tokenizer.push_to_hub(...)`" ) pass arguments = dict(locals()) arguments["model"] = self arguments["save_directory"] = repo_id arguments["push_to_hub"] = True del arguments["self"] del arguments["repo_id"] unsloth_save_model(**arguments) for _ in range(3): gc.collect() pass MODEL_CARD = \ """--- base_model: {base_model} tags: - text-generation-inference - transformers - unsloth - {model_type} - {extra} license: apache-2.0 language: - en --- # Uploaded {method} model - **Developed by:** {username} - **License:** apache-2.0 - **Finetuned from model :** {base_model} This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library. 
[](https://github.com/unslothai/unsloth) """ def _determine_username(save_directory, old_username, token): username = "" save_directory = save_directory.lstrip("./") if "/" not in save_directory: from huggingface_hub import whoami try: username = whoami(token = token)["name"] if type(old_username) is str and username != old_username: username = old_username pass save_directory = f"{username}/{save_directory}" except: raise RuntimeError(f"Unsloth: {save_directory} is not a Huggingface directory.") else: username = save_directory.split("/")[0] pass return save_directory, username pass def create_huggingface_repo( model, save_directory, token = None, private = False, ): if token is None : token = get_token() pass save_directory, username = _determine_username(save_directory, "", token) from huggingface_hub import create_repo try: create_repo( repo_id = save_directory, token = token, repo_type = "model", exist_ok = False, private = private, ) # Create model card from huggingface_hub import ModelCard content = MODEL_CARD.format( username = username, base_model = model.config._name_or_path, model_type = model.config.model_type, method = "", extra = "unsloth", ) card = ModelCard(content) card.push_to_hub(save_directory, token = token) except: pass hf_api = HfApi(token = token) return save_directory, hf_api pass def upload_to_huggingface( model, save_directory, token, method, extra = "", file_location = None, old_username = None, private = None, create_config = True, ): save_directory, username = _determine_username(save_directory, old_username, token) from huggingface_hub import create_repo try: create_repo( repo_id = save_directory, token = token, repo_type = "model", exist_ok = False, private = private, ) # Create model card from huggingface_hub import ModelCard content = MODEL_CARD.format( username = username, base_model = model.config._name_or_path, model_type = model.config.model_type, method = "", extra = extra, ) card = ModelCard(content) card.push_to_hub(save_directory, token = token) except: pass if file_location is not None: # Now upload file hf_api = HfApi(token = token) if "/" in file_location: uploaded_location = file_location[file_location.rfind("/")+1:] else: uploaded_location = file_location pass # find ftevent file from tensorboard and upload it import glob ftevent_files = glob.glob("*out.tfevents*", recursive = True) if len(ftevent_files) > 0: print("Unsloth: Uploading tensorboard files... 
Please wait...", file_location + "*out.tfevents*") for ftevent_file in ftevent_files: hf_api.upload_file( path_or_fileobj = ftevent_file, path_in_repo = ftevent_file.replace(file_location, ""), repo_id = save_directory, repo_type = "model", commit_message = "(Trained with Unsloth)", ) pass pass hf_api.upload_file( path_or_fileobj = file_location, path_in_repo = uploaded_location, repo_id = save_directory, repo_type = "model", commit_message = "(Trained with Unsloth)", ) # We also upload a config.json file if create_config: import json with open("_temporary_unsloth_config.json", "w") as file: json.dump({"model_type" : model.config.model_type}, file, indent = 4) pass hf_api.upload_file( path_or_fileobj = "_temporary_unsloth_config.json", path_in_repo = "config.json", repo_id = save_directory, repo_type = "model", commit_message = "(Trained with Unsloth)", ) os.remove("_temporary_unsloth_config.json") pass pass return username pass def fix_tokenizer_bos_token(tokenizer): # Check if BOS added already, then warn fix_bos_token = False chat_template = getattr(tokenizer, "chat_template", None) if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)): if chat_template is not None and \ ( tokenizer.bos_token in chat_template or \ "{bos_token}" in chat_template.replace(" ", "") or \ "{bos_token+" in chat_template.replace(" ", "") ): fix_bos_token = True logger.warning( "Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily." ) # Remove {{bos_token}} new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template) # Remove {{bos_token + new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template) tokenizer.chat_template = new_chat_template pass pass return fix_bos_token, chat_template pass def create_ollama_modelfile(tokenizer, gguf_location): """ Creates an Ollama Modelfile. 
Use ollama.create(model = "new_ollama_model", modelfile = modelfile) """ modelfile = getattr(tokenizer, "_ollama_modelfile", None) if modelfile is None: return None FILE_LOCATION_REPLACER = "⚫@✅#🦥__FILE_LOCATION__⚡@🦥#⛵" EOS_TOKEN_REPLACER = "⚫@✅#🦥__EOS_TOKEN__⚡@🦥#⛵" LEFT_BRACKET_REPLACER = "⚫@✅#🦥" RIGHT_BRACKET_REPLACER = "⚡@🦥#⛵" # Fixes https://github.com/unslothai/unsloth/issues/1087 # We must convert all {'s and }'s but keep {__FILE_LOCATION__} intact modelfile = modelfile\ .replace("{__FILE_LOCATION__}", FILE_LOCATION_REPLACER)\ .replace("{__EOS_TOKEN__}", EOS_TOKEN_REPLACER)\ .replace("{", LEFT_BRACKET_REPLACER)\ .replace("}", RIGHT_BRACKET_REPLACER) # Revert {__FILE_LOCATION__} back modelfile = modelfile\ .replace(FILE_LOCATION_REPLACER, "{__FILE_LOCATION__}")\ .replace(EOS_TOKEN_REPLACER, "{__EOS_TOKEN__}") if "__EOS_TOKEN__" in modelfile: modelfile = modelfile.format( __FILE_LOCATION__ = gguf_location, __EOS_TOKEN__ = tokenizer.eos_token, ) else: modelfile = modelfile.format( __FILE_LOCATION__ = gguf_location, ) pass modelfile = modelfile\ .replace("⚫@✅#🦥", "{")\ .replace("⚡@🦥#⛵", "}")\ .rstrip() return modelfile pass def unsloth_save_pretrained_gguf( self, save_directory : Union[str, os.PathLike], tokenizer = None, quantization_method : str = "fast_quantized", first_conversion : str = None, push_to_hub : bool = False, token : Optional[Union[str, bool]] = None, private : Optional[bool] = None, is_main_process : bool = True, state_dict : Optional[dict] = None, save_function : Callable = torch.save, max_shard_size : Union[int, str] = "5GB", safe_serialization : bool = True, variant : Optional[str] = None, save_peft_format : bool = True, tags : List[str] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", maximum_memory_usage : float = 0.85, ): """ Same as .save_pretrained(...) except 4bit weights are auto converted to float16 then converted to GGUF / llama.cpp format. Choose for `quantization_method` to be: "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.", "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.", "quantized" : "Recommended. Slow conversion. Fast inference, small files.", "f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.", "f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.", "q8_0" : "Fast conversion. High resource use, but generally acceptable.", "q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K", "q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K", "q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.", "q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K", "q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K", "q3_k_s" : "Uses Q3_K for all tensors", "q4_0" : "Original quant method, 4-bit.", "q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. 
However has quicker inference than q5 models.", "q4_k_s" : "Uses Q4_K for all tensors", "q4_k" : "alias for q4_k_m", "q5_k" : "alias for q5_k_m", "q5_0" : "Higher accuracy, higher resource usage and slower inference.", "q5_1" : "Even higher accuracy, resource usage and slower inference.", "q5_k_s" : "Uses Q5_K for all tensors", "q6_k" : "Uses Q8_K for all tensors", "iq2_xxs" : "2.06 bpw quantization", "iq2_xs" : "2.31 bpw quantization", "iq3_xxs" : "3.06 bpw quantization", "q3_k_xs" : "3-bit extra small quantization", """ if tokenizer is None: raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.") arguments = dict(locals()) arguments["model"] = self arguments["tokenizer"] = tokenizer arguments["push_to_hub"] = False # We save ourselves arguments["save_method"] = "merged_16bit" # Must be 16bit del arguments["self"] del arguments["quantization_method"] del arguments["first_conversion"] # Fix tokenizer adding an extra BOS token at the front fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer) # Non blocking install GGUF first if not os.path.exists("llama.cpp"): if IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: git_clone = install_llama_cpp_clone_non_blocking() python_install = install_python_non_blocking(["gguf", "protobuf"]) git_clone.wait() makefile = install_llama_cpp_make_non_blocking() new_save_directory, old_username = unsloth_save_model(**arguments) python_install.wait() pass else: try: new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None except: # Retry by recloning llama.cpp if IS_KAGGLE_ENVIRONMENT: # Kaggle is weird - no blocking installs, and no CUDA? 
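            # Retry path (llama.cpp already existed but the save failed): reinstall from
            # scratch. On Kaggle that means a blocking, CPU-only build; elsewhere the clone
            # and make/cmake build run in the background and are overlapped with the 16-bit
            # merge, with the build handle (`makefile`) later awaited inside save_to_gguf.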
python_install = install_python_non_blocking(["gguf", "protobuf"]) python_install.wait() install_llama_cpp_blocking(use_cuda = False) new_save_directory, old_username = unsloth_save_model(**arguments) makefile = None else: git_clone = install_llama_cpp_clone_non_blocking() python_install = install_python_non_blocking(["gguf", "protobuf"]) git_clone.wait() makefile = install_llama_cpp_make_non_blocking() new_save_directory, old_username = unsloth_save_model(**arguments) python_install.wait() pass pass pass # Use old chat template if the bos is removed if fix_bos_token: tokenizer.chat_template = old_chat_template pass for _ in range(3): gc.collect() model_dtype = self.config.torch_dtype model_type = self.config.model_type if type(model_dtype) is str: assert(model_dtype == "float16" or model_dtype == "bfloat16") elif model_dtype == torch.float16: model_dtype = "float16" elif model_dtype == torch.bfloat16: model_dtype = "bfloat16" else: raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16") pass is_sentencepiece_model = check_if_sentencepiece_model(self) # Save to GGUF all_file_locations, want_full_precision = save_to_gguf( model_type, model_dtype, is_sentencepiece_model, new_save_directory, quantization_method, first_conversion, makefile, ) # Save Ollama modelfile modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0]) modelfile_location = None if modelfile is not None: modelfile_location = os.path.join(new_save_directory, "Modelfile") with open(modelfile_location, "w") as file: file.write(modelfile) pass print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}") pass if fix_bos_token: logger.warning( "Unsloth: ##### The current model auto adds a BOS token.\n"\ "Unsloth: ##### We removed it in GGUF's chat template for you." ) pass if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") # If not needing full precision, skip the first if not want_full_precision: all_file_locations = all_file_locations[1:] for file_location in all_file_locations: username = upload_to_huggingface( self, save_directory, token, "GGUF converted", "gguf", file_location, old_username, private, ) link = f"{username}/{new_save_directory.lstrip('/.')}" \ if username not in new_save_directory else \ new_save_directory.lstrip('/.') print(f"Saved GGUF to https://huggingface.co/{link}") pass # Save modelfile if modelfile_location is not None: username = upload_to_huggingface( self, save_directory, token, "GGUF converted", "gguf", modelfile_location, old_username, private, ) print(f"Saved Ollama Modelfile to https://huggingface.co/{link}") pass pass pass def unsloth_push_to_hub_gguf( self, repo_id : str, tokenizer = None, quantization_method : str = "fast_quantized", first_conversion : str = None, use_temp_dir : Optional[bool] = None, commit_message : Optional[str] = "Trained with Unsloth", private : Optional[bool] = None, token : Union[bool, str, None] = None, max_shard_size : Union[int, str, None] = "5GB", create_pr : bool = False, safe_serialization : bool = True, revision : str = None, commit_description : str = "Upload model trained with Unsloth 2x faster", tags : Optional[List[str]] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", maximum_memory_usage : float = 0.85, ): """ Same as .push_to_hub(...) except 4bit weights are auto converted to float16 then converted to GGUF / llama.cpp format. Choose for `quantization_method` to be: "not_quantized" : "Recommended. Fast conversion. Slow inference, big files.", "fast_quantized" : "Recommended. 

def unsloth_push_to_hub_gguf(
    self,
    repo_id              : str,
    tokenizer            = None,
    quantization_method  : str = "fast_quantized",
    first_conversion     : str = None,
    use_temp_dir         : Optional[bool] = None,
    commit_message       : Optional[str] = "Trained with Unsloth",
    private              : Optional[bool] = None,
    token                : Union[bool, str, None] = None,
    max_shard_size       : Union[int, str, None] = "5GB",
    create_pr            : bool = False,
    safe_serialization   : bool = True,
    revision             : str = None,
    commit_description   : str = "Upload model trained with Unsloth 2x faster",
    tags                 : Optional[List[str]] = None,
    temporary_location   : str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage : float = 0.85,
):
    """
    Same as .push_to_hub(...) except 4bit weights are auto
    converted to float16 then converted to GGUF / llama.cpp format.

    Choose for `quantization_method` to be:
    "not_quantized"  : "Recommended. Fast conversion. Slow inference, big files.",
    "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
    "quantized"      : "Recommended. Slow conversion. Fast inference, small files.",
    "f32"     : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
    "f16"     : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
    "q8_0"    : "Fast conversion. High resource use, but generally acceptable.",
    "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
    "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
    "q2_k"    : "Uses Q4_K for the attention.wv and feed_forward.w2 tensors, Q2_K for the other tensors.",
    "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_s"  : "Uses Q3_K for all tensors",
    "q4_0"    : "Original quant method, 4-bit.",
    "q4_1"    : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
    "q4_k_s"  : "Uses Q4_K for all tensors",
    "q5_0"    : "Higher accuracy, higher resource usage and slower inference.",
    "q5_1"    : "Even higher accuracy, resource usage and slower inference.",
    "q5_k_s"  : "Uses Q5_K for all tensors",
    "q6_k"    : "Uses Q8_K for all tensors",
    """
    if tokenizer is None:
        raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")

    arguments = dict(locals())
    arguments["model"]          = self
    arguments["tokenizer"]      = tokenizer
    arguments["save_directory"] = repo_id
    arguments["push_to_hub"]    = False # We save ourselves
    arguments["save_method"]    = "merged_16bit" # Must be 16bit
    del arguments["self"]
    del arguments["repo_id"]
    del arguments["quantization_method"]
    del arguments["first_conversion"]

    # Fix tokenizer adding an extra BOS token at the front
    fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)

    # Non blocking install GGUF first
    if not os.path.exists("llama.cpp"):
        if IS_KAGGLE_ENVIRONMENT:
            # Kaggle is weird - no blocking installs, and no CUDA?
            python_install = install_python_non_blocking(["gguf", "protobuf"])
            python_install.wait()
            install_llama_cpp_blocking(use_cuda = False)
            new_save_directory, old_username = unsloth_save_model(**arguments)
            makefile = None
        else:
            git_clone = install_llama_cpp_clone_non_blocking()
            python_install = install_python_non_blocking(["gguf", "protobuf"])
            git_clone.wait()
            makefile = install_llama_cpp_make_non_blocking()
            new_save_directory, old_username = unsloth_save_model(**arguments)
            python_install.wait()
        pass
    else:
        try:
            new_save_directory, old_username = unsloth_save_model(**arguments)
            makefile = None
        except:
            # Retry by recloning llama.cpp
            if IS_KAGGLE_ENVIRONMENT:
                # Kaggle is weird - no blocking installs, and no CUDA?
                python_install = install_python_non_blocking(["gguf", "protobuf"])
                python_install.wait()
                install_llama_cpp_blocking(use_cuda = False)
                new_save_directory, old_username = unsloth_save_model(**arguments)
                makefile = None
            else:
                git_clone = install_llama_cpp_clone_non_blocking()
                python_install = install_python_non_blocking(["gguf", "protobuf"])
                git_clone.wait()
                makefile = install_llama_cpp_make_non_blocking()
                new_save_directory, old_username = unsloth_save_model(**arguments)
                python_install.wait()
            pass
        pass
    pass

    # Use old chat template if the bos is removed
    if fix_bos_token:
        tokenizer.chat_template = old_chat_template
    pass

    for _ in range(3):
        gc.collect()

    model_dtype = self.config.torch_dtype
    model_type  = self.config.model_type
    if type(model_dtype) is str:
        assert(model_dtype == "float16" or model_dtype == "bfloat16")
    elif model_dtype == torch.float16:
        model_dtype = "float16"
    elif model_dtype == torch.bfloat16:
        model_dtype = "bfloat16"
    else:
        raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
    pass

    is_sentencepiece_model = check_if_sentencepiece_model(self)

    # Save to GGUF
    all_file_locations, want_full_precision = save_to_gguf(
        model_type, model_dtype, is_sentencepiece_model,
        new_save_directory, quantization_method, first_conversion, makefile,
    )

    # Save Ollama modelfile
    modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
    modelfile_location = None
    if modelfile is not None:
        modelfile_location = os.path.join(new_save_directory, "Modelfile")
        with open(modelfile_location, "w") as file:
            file.write(modelfile)
        pass
        print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
    pass

    # If not needing full precision, skip the first
    if not want_full_precision: all_file_locations = all_file_locations[1:]

    for file_location in all_file_locations:
        print("Unsloth: Uploading GGUF to Huggingface Hub...")
        username = upload_to_huggingface(
            self, repo_id, token,
            "GGUF converted", "gguf", file_location, old_username, private,
        )
        link = f"{username}/{new_save_directory.lstrip('/.')}" \
            if username not in new_save_directory else \
            new_save_directory.lstrip('/.')
        print(f"Saved GGUF to https://huggingface.co/{link}")
    pass

    # Save modelfile
    if modelfile_location is not None:
        username = upload_to_huggingface(
            self, repo_id, token,
            "GGUF converted", "gguf", modelfile_location, old_username, private,
        )
        print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
    pass

    if fix_bos_token:
        logger.warning(
            "Unsloth: ##### The current model auto adds a BOS token.\n"\
            "Unsloth: ##### We removed it in GGUF's chat template for you."
        )
    pass
pass
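
# Usage sketch (illustrative only, not executed): exposed as `model.push_to_hub_gguf`
# by `patch_saving_functions`. The repo id and token below are placeholders.
#
#   model.push_to_hub_gguf(
#       "your-username/your-model-gguf",    # hypothetical repo id
#       tokenizer,
#       quantization_method = "q8_0",
#       token = "hf_...",
#   )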

# Helper to save LoRA adapters to a custom directory
def save_lora_to_custom_dir(model, tokenizer, save_directory):
    # Create the custom directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Call the unsloth_save_model function with the custom directory
    unsloth_save_model(
        model,
        tokenizer,
        save_directory=save_directory,
        save_method="lora",
        push_to_hub=False,
    )


# Convert LoRA adapters to GGML format and push them to the Hugging Face Hub
def unsloth_convert_lora_to_ggml_and_push_to_hub(
    self,
    tokenizer,
    repo_id: str,
    use_temp_dir: Optional[bool] = None,
    commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth",
    private: Optional[bool] = None,
    token: Union[bool, str, None] = None,
    create_pr: bool = False,
    revision: str = None,
    commit_description: str = "Convert LoRA to GGML format using Unsloth",
    temporary_location: str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage: float = 0.85,
):
    if not os.path.exists("llama.cpp"):
        if IS_KAGGLE_ENVIRONMENT:
            python_install = install_python_non_blocking(["protobuf"])
            python_install.wait()
            install_llama_cpp_blocking(use_cuda=False)
            makefile = None
        else:
            git_clone = install_llama_cpp_clone_non_blocking()
            python_install = install_python_non_blocking(["protobuf"])
            git_clone.wait()
            makefile = install_llama_cpp_make_non_blocking()
            python_install.wait()
    else:
        makefile = None

    for _ in range(3):
        gc.collect()

    lora_directory_push = "lora-to-ggml-push"
    save_lora_to_custom_dir(self, tokenizer, lora_directory_push)

    model_type  = self.config.model_type
    output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin")

    print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.")
    print(f"The output file will be {output_file}")

    command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama"

    try:
        with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
            for line in sp.stdout:
                print(line, end="", flush=True)
            for line in sp.stderr:
                print(line, end="", flush=True)
            sp.wait()
            if sp.returncode != 0:
                raise subprocess.CalledProcessError(sp.returncode, command)
    except subprocess.CalledProcessError as e:
        print(f"Error: Conversion failed with return code {e.returncode}")
        return

    print(f"Unsloth: Conversion completed! Output file: {output_file}")

    print("Unsloth: Uploading GGML file to Hugging Face Hub...")
    username = upload_to_huggingface(
        self, repo_id, token,
        "GGML converted LoRA", "ggml", output_file, None, private,
    )
    link = f"{repo_id.lstrip('/')}"
    print("Unsloth: Done.")
    print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}")
    print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
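
# Usage sketch (illustrative only, not executed): exposed as `model.push_to_hub_ggml`
# by `patch_saving_functions`. Note the argument order: tokenizer first, then the repo id.
#
#   model.push_to_hub_ggml(
#       tokenizer,
#       "your-username/your-lora-ggml",     # hypothetical repo id
#       token = "hf_...",
#   )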

def unsloth_convert_lora_to_ggml_and_save_locally(
    self,
    save_directory: str,  # Folder name for the saved adapters
    tokenizer,
    temporary_location: str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage: float = 0.85,
):
    if not os.path.exists("llama.cpp"):
        if IS_KAGGLE_ENVIRONMENT:
            python_install = install_python_non_blocking(["protobuf"])
            python_install.wait()
            install_llama_cpp_blocking(use_cuda=False)
            makefile = None
        else:
            git_clone = install_llama_cpp_clone_non_blocking()
            python_install = install_python_non_blocking(["protobuf"])
            git_clone.wait()
            makefile = install_llama_cpp_make_non_blocking()
            python_install.wait()
    else:
        makefile = None

    for _ in range(3):
        gc.collect()

    # Use the provided save_directory for local saving
    save_lora_to_custom_dir(self, tokenizer, save_directory)

    model_type  = self.config.model_type
    output_file = os.path.join(save_directory, "ggml-adapter-model.bin")

    print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.")
    print(f"The output file will be {output_file}")

    command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama"

    try:
        with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
            for line in sp.stdout:
                print(line, end="", flush=True)
            for line in sp.stderr:
                print(line, end="", flush=True)
            sp.wait()
            if sp.returncode != 0:
                raise subprocess.CalledProcessError(sp.returncode, command)
    except subprocess.CalledProcessError as e:
        print(f"Error: Conversion failed with return code {e.returncode}")
        return

    print("Unsloth: Done.")
    print(f"Unsloth: Conversion completed! Output file: {output_file}")
    print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
pass
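
# Usage sketch (illustrative only, not executed): exposed as `model.save_pretrained_ggml`
# by `patch_saving_functions`; the directory name is a placeholder.
#
#   model.save_pretrained_ggml("ggml_lora_output", tokenizer)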

from .models.loader_utils import get_model_name
from unsloth_zoo.saving_utils import merge_and_overwrite_lora

@torch.inference_mode
def unsloth_generic_save(
    model,
    tokenizer,
    save_directory       : Union[str, os.PathLike] = "unsloth_finetuned_merge",
    save_method          : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
    push_to_hub          : bool = False,
    token                : Optional[Union[str, bool]] = None,
    is_main_process      : bool = True,
    state_dict           : Optional[dict] = None,
    save_function        : Callable = torch.save,
    max_shard_size       : Union[int, str] = "5GB",
    safe_serialization   : bool = True,
    variant              : Optional[str] = None,
    save_peft_format     : bool = True,

    # Push to hub
    use_temp_dir         : Optional[bool] = None,
    commit_message       : Optional[str] = "Trained with Unsloth",
    private              : Optional[bool] = None,
    create_pr            : bool = False,
    revision             : str = None,
    commit_description   : str = "Upload model trained with Unsloth 2x faster",
    tags                 : List[str] = None,

    # Our functions
    temporary_location   : str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage : float = 0.9,
):
    if token is None and push_to_hub:
        token = get_token()

    merge_and_overwrite_lora(
        get_model_name,
        model                = model,
        tokenizer            = tokenizer,
        save_directory       = save_directory,
        push_to_hub          = push_to_hub,
        private              = private,
        token                = token,
        output_dtype         = None,
        low_disk_space_usage = False,
        use_temp_file        = False,
    )
    return
pass


def unsloth_generic_save_pretrained_merged(
    self,
    save_directory       : Union[str, os.PathLike],
    tokenizer            = None,
    save_method          : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
    push_to_hub          : bool = False,
    token                : Optional[Union[str, bool]] = None,
    is_main_process      : bool = True,
    state_dict           : Optional[dict] = None,
    save_function        : Callable = torch.save,
    max_shard_size       : Union[int, str] = "5GB",
    safe_serialization   : bool = True,
    variant              : Optional[str] = None,
    save_peft_format     : bool = True,
    tags                 : List[str] = None,
    temporary_location   : str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage : float = 0.75,
):
    """
    Same as .save_pretrained(...) except 4bit weights are auto
    converted to float16 with as little overhead as possible.

    Choose for `save_method` to be either:
    1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
    2. `merged_4bit` : Merge LoRA into int4 weights. Useful for DPO / HF inference.
    3. `lora`        : Save LoRA adapters with no merging. Useful for HF inference.
    """
    if tokenizer is None:
        logger.warning_once(
            "Unsloth: You're not saving a tokenizer as well?\n"\
            "You can do it separately via `tokenizer.save_pretrained(...)`"
        )
    pass

    arguments = dict(locals())
    arguments["model"] = self
    del arguments["self"]
    unsloth_generic_save(**arguments)
    for _ in range(3):
        gc.collect()
pass
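
# Usage sketch (illustrative only, not executed): for vision models `patch_saving_functions`
# exposes the function above as `model.save_pretrained_merged`. The directory name is a placeholder.
#
#   model.save_pretrained_merged(
#       "merged_output",
#       tokenizer,
#       save_method = "merged_16bit",
#   )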

def unsloth_generic_push_to_hub_merged(
    self,
    repo_id              : str,
    tokenizer            = None,
    save_method          : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
    use_temp_dir         : Optional[bool] = None,
    commit_message       : Optional[str] = "Trained with Unsloth",
    private              : Optional[bool] = None,
    token                : Union[bool, str, None] = None,
    max_shard_size       : Union[int, str, None] = "5GB",
    create_pr            : bool = False,
    safe_serialization   : bool = True,
    revision             : str = None,
    commit_description   : str = "Upload model trained with Unsloth 2x faster",
    tags                 : Optional[List[str]] = None,
    temporary_location   : str = "_unsloth_temporary_saved_buffers",
    maximum_memory_usage : float = 0.75,
):
    """
    Same as .push_to_hub(...) except 4bit weights are auto
    converted to float16 with as little overhead as possible.

    Choose for `save_method` to be either:
    1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
    2. `merged_4bit` : Merge LoRA into int4 weights. Useful for DPO / HF inference.
    3. `lora`        : Save LoRA adapters with no merging. Useful for HF inference.
    """
    if tokenizer is None:
        logger.warning_once(
            "Unsloth: You're not saving a tokenizer as well?\n"\
            "You can do it separately via `tokenizer.push_to_hub(...)`"
        )
    pass

    arguments = dict(locals())
    arguments["model"]          = self
    arguments["save_directory"] = repo_id
    arguments["push_to_hub"]    = True
    del arguments["self"]
    del arguments["repo_id"]
    unsloth_generic_save(**arguments)
    for _ in range(3):
        gc.collect()
pass


def not_implemented_save(*args, **kwargs):
    raise NotImplementedError("Unsloth: Sorry GGUF is currently not supported for vision models!")
pass


def patch_saving_functions(model, vision = False):
    import inspect
    import types
    from typing import Callable, Optional, Union, List

    # And now re add our saving methods!
    if model.push_to_hub.__name__ == "unsloth_push_to_hub":
        original_push_to_hub = model.original_push_to_hub
    else:
        original_push_to_hub = model.push_to_hub
    pass

    signature = str(inspect.signature(original_push_to_hub)).replace("NoneType", "None")
    signature = signature[1:]
    # Replace the unparseable repr of the default save function (e.g. `<function save at 0x...>`)
    signature = re.sub(r"<function save at .+?>", "torch.save", signature)
    docs = original_push_to_hub.__doc__.encode("utf-8").decode("utf-8")

    push_to_hub_text = f'''def unsloth_push_to_hub(self, {signature}:
    """
    {docs}
    """
    arguments = dict(locals())
    del arguments["self"]
    if "tags" in arguments and arguments["tags"] is not None:
        assert(isinstance(arguments["tags"], (list, tuple)))
        arguments["tags"] = list(arguments["tags"]) + ["unsloth",]
    elif "tags" in arguments:
        arguments["tags"] = ["unsloth",]
    elif hasattr(self, "add_model_tags"):
        self.add_model_tags(["unsloth",])

    if "commit_message" in arguments:
        commit_message = arguments["commit_message"]
        if commit_message is not None:
            if not commit_message.endswith(" "): commit_message += " "
            if "Unsloth" not in commit_message:
                commit_message += "(Trained with Unsloth)"
        else:
            commit_message = "Upload model trained with Unsloth"
        arguments["commit_message"] = commit_message

    if "commit_description" in arguments:
        commit_description = arguments["commit_description"]
        if commit_description is not None:
            if not commit_description.endswith(" "): commit_description += " "
            if "Unsloth" not in commit_description:
                commit_description += "(Trained with Unsloth 2x faster)"
        else:
            commit_description = "Upload model trained with Unsloth 2x faster"
        arguments["commit_description"] = commit_description

    # Update model tag
    if hasattr(self, "config"):
        _ = upload_to_huggingface(
            self, arguments["repo_id"], arguments["token"],
            "finetuned", "trl", file_location = None,
            old_username = None, private = arguments["private"],
        )
    pass

    try:
        self.original_push_to_hub(**arguments)
    except:
        del arguments["tags"]
        self.original_push_to_hub(**arguments)
    pass

    if hasattr(self, "config"):
        print("Saved model to https://huggingface.co/" + arguments["repo_id"])
    pass
'''
    exec(push_to_hub_text, globals())

    original_model = model
    while True:
        if original_model.push_to_hub.__name__ != "unsloth_push_to_hub":
            original_model.original_push_to_hub = original_model.push_to_hub
            original_model.push_to_hub = types.MethodType(unsloth_push_to_hub, original_model)
            if hasattr(original_model, "add_model_tags"):
                original_model.add_model_tags(["unsloth",])
            pass
        pass

        if hasattr(original_model, "model"):
            original_model = original_model.model
        else:
            break
    pass

    # Add saving methods to top level model
    if not vision:
        if hasattr(model, "config"):
            # Counteract tokenizers
            model.push_to_hub_merged     = types.MethodType(unsloth_push_to_hub_merged,     model)
            model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model)
            model.push_to_hub_gguf       = types.MethodType(unsloth_push_to_hub_gguf,       model)
            model.save_pretrained_gguf   = types.MethodType(unsloth_save_pretrained_gguf,   model)
            model.push_to_hub_ggml       = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub,  model)
            model.save_pretrained_ggml   = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model)
        pass
    else:
        # Vision only 1 option
        model.push_to_hub_merged     = types.MethodType(unsloth_generic_push_to_hub_merged,     model)
        model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model)
        model.push_to_hub_gguf       = types.MethodType(not_implemented_save, model)
        model.save_pretrained_gguf   = types.MethodType(not_implemented_save, model)
    pass
    return model
pass
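
# End-to-end sketch (illustrative only, not executed): once a model has been patched,
# the saving helpers defined in this file are available as bound methods. All paths and
# names below are placeholders.
#
#   model = patch_saving_functions(model)
#   model.save_pretrained_merged("outputs/merged", tokenizer, save_method = "merged_16bit")
#   model.save_pretrained_gguf("outputs/gguf", tokenizer, quantization_method = "q4_k_m")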