import torch
from transformers import (
    BitsAndBytesConfig,
    AutoModelForVision2Seq,
    AutoProcessor,
)
from .llama import *
from ..kernels import (
    post_patch_loss_function,
)
from ._utils import __version__
# Alias PEFT's get_peft_model so it does not shadow FastBaseVisionModel.get_peft_model below
from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model
from transformers import set_seed as transformers_set_seed
from unsloth_zoo.peft_utils import (
    get_peft_regex,
    SKIP_QUANTIZATION_MODULES,
)
from triton import __version__ as triton_version

__all__ = [
    "FastBaseVisionModel",
]
|
|
|
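# Wraps `model.generate` so calls run under `torch.inference_mode` and autocast,
# with `pad_token_id` defaulting to the EOS token and `pixel_values` cast to the
# model's dtype.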
def _wrap_fast_inference(generate, device_type, dtype, model):
    @torch.inference_mode
    def _fast_generate(*args, **kwargs):
        # Some processors emit token_type_ids which generate() does not accept
        kwargs.pop("token_type_ids", None)

        # Check pad_token_id - fall back to the model's EOS token if unset
        model_eos_token_id = getattr(model.config, "eos_token_id", None)
        if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
            model_eos_token_id = model_eos_token_id[0]

        kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)

        # Cast pixel_values to the model's dtype if they were provided
        if "pixel_values" in kwargs:
            try:
                kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype)
            except (AttributeError, TypeError):
                pass
        pass

        # Autocasting
        with torch.autocast(device_type = device_type, dtype = dtype):
            output = generate(*args, **kwargs)
        pass
        return output
    pass
    return _fast_generate
pass
|
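# FastBaseVisionModel bundles loading, LoRA attachment, and train / inference
# mode switching for vision-to-sequence models.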
class FastBaseVisionModel:
|
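    # Loads a Vision2Seq model (optionally 4-bit quantized) together with its
    # processor, prints hardware statistics, and applies Unsloth's patches.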
    @staticmethod
    def from_pretrained(
        model_name = "unsloth/llama-3-8b-bnb-4bit",
        max_seq_length = None,
        dtype = None,
        load_in_4bit = True,
        token = None,
        device_map = "sequential",
        trust_remote_code = False,
        model_types = None,
        tokenizer_name = None,
        **kwargs,
    ):
        if trust_remote_code:
            print(
                "Unsloth: WARNING `trust_remote_code` is True.\n"\
                "Are you certain you want to do remote code execution?"
            )
        pass
        if token is None: token = get_token()
        SUPPORTS_BFLOAT16 = is_bfloat16_supported()
        gpu_stats = torch.cuda.get_device_properties(0)
        max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

        statistics = \
            f"==((====))==  Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers: {transformers_version}.\n"\
            f"   \\\\   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
            f"O^O/ \\_/ \\    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
            f"\\        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
            f' "-____-"     Free Apache license: http://github.com/unslothai/unsloth'
        print(statistics)

        # Warn about fast downloading, then force hf_transfer on; the original
        # value is restored right after the model download below.
        old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0")
        if old_hf_transfer == "1":
            print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!")
        pass
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

        get_statistics()

        if dtype is None:
            dtype = torch.bfloat16 if SUPPORTS_BFLOAT16 else torch.float16
        elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
            logger.warning_once("Device does not support bfloat16. Will change to float16.")
            dtype = torch.float16
        pass

        assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32)
|
        # GPU memory check before loading
        pre_check = check_nvidia()

        bnb_config = None
        if load_in_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit = True,
                bnb_4bit_use_double_quant = True,
                bnb_4bit_quant_type = "nf4",
                bnb_4bit_compute_dtype = dtype,
                llm_int8_skip_modules = SKIP_QUANTIZATION_MODULES,
            )
        pass

        # Ignore any user-supplied attn_implementation
        kwargs.pop("attn_implementation", None)

        if load_in_4bit: kwargs["quantization_config"] = bnb_config

        model = AutoModelForVision2Seq.from_pretrained(
            model_name,
            device_map = device_map,
            torch_dtype = dtype,
            token = token,
            trust_remote_code = trust_remote_code,
            **kwargs,
        )
        # Restore the user's original hf_transfer setting
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer

        # GPU memory check after loading
        post_check = check_nvidia()

        # Load the processor (it wraps the text tokenizer)
        tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
        tokenizer = AutoProcessor.from_pretrained(
            tokenizer_name,
            padding_side = "right",
            token = token,
        )
        tokenizer.tokenizer.padding_side = "right"

        model, tokenizer = patch_tokenizer(model, tokenizer)
        model = post_patch_loss_function(model)
|
        # Disabled path that strips the "-bnb-4bit" suffix from the saved config name
        if False:
            name = model.config._name_or_path
            if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
                name = name[:len(name) - len("-bnb-4bit")]
                model.config.update({"_name_or_path" : name})
            pass
        pass

        # Record the Unsloth version inside the model config
        if hasattr(model, "config"):
            model.config.update({"unsloth_version" : __version__})
        pass
        patch_saving_functions(model, vision = True)
        patch_saving_functions(tokenizer, vision = True)

        # Fix gradient accumulation inside the HF Trainer
        from transformers.trainer import Trainer
        patch_gradient_accumulation_fix(Trainer)

        # Default to left padding for generation; training flips this back to right
        tokenizer.padding_side = "left"
        tokenizer.tokenizer.padding_side = "left"
        internal_model = model
        while hasattr(internal_model, "model"):
            internal_model._saved_temp_tokenizer = tokenizer
            internal_model = internal_model.model
        pass
        internal_model._saved_temp_tokenizer = tokenizer

        return model, tokenizer
    pass
|
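    # Attaches LoRA adapters to the selected vision / language / attention / MLP
    # modules and returns the PEFT-wrapped model.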
    @staticmethod
    def get_peft_model(
        model,
        r = 16,
        target_modules = None,
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        finetune_vision_layers = True,
        finetune_language_layers = True,
        finetune_attention_modules = True,
        finetune_mlp_modules = True,
        layers_to_transform = None,
        layers_pattern = None,
        use_gradient_checkpointing = True,
        random_state = 3407,
        max_seq_length = 2048,
        use_rslora = False,
        modules_to_save = None,
        init_lora_weights = True,
        loftq_config = {},
        temporary_location = "_unsloth_temporary_saved_buffers",
        **kwargs,
    ):
        transformers_set_seed(random_state)

        if type(r) is not int:
            raise TypeError(f"Unsloth: Rank of {str(r)} must be an integer.")
        if r <= 0:
            raise TypeError(f"Unsloth: Rank of {str(r)} must be larger than 0.")

        if isinstance(model, PeftModelForCausalLM):
            raise RuntimeError("Unsloth: You already added LoRA adapters to your model!")

        if target_modules == "all-linear":
            finetune_vision_layers = True
            finetune_language_layers = True
            finetune_attention_modules = True
            finetune_mlp_modules = True
        pass
        if target_modules is None:
            # Build a regex that selects the requested vision / language / attention / MLP modules
            target_modules = get_peft_regex(
                model,
                finetune_vision_layers = finetune_vision_layers,
                finetune_language_layers = finetune_language_layers,
                finetune_attention_modules = finetune_attention_modules,
                finetune_mlp_modules = finetune_mlp_modules,
            )
        else:
            assert(type(target_modules) in (list, tuple,))
        pass

        # Clear deleted GPU items
        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()
        pass

        lora_config = LoraConfig(
            r = r,
            lora_alpha = lora_alpha,
            target_modules = target_modules,
            lora_dropout = lora_dropout,
            bias = bias,
            task_type = TaskType.CAUSAL_LM,
        )
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing = use_gradient_checkpointing,
        )
        model = _get_peft_model(model, lora_config)

        model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)

        # Clear deleted GPU items
        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()
        pass
        patch_saving_functions(model, vision = True)

        return model
    pass
|
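    # Re-checks gradient checkpointing after PEFT wrapping, guards against
    # unpatched multi-GPU Trainers, and resets saved tokenizers to right padding.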
    @staticmethod
    def patch_peft_model(
        model,
        use_gradient_checkpointing = True,
    ):
        if not isinstance(model, PeftModelForCausalLM):
            raise TypeError(
                "Unsloth: Your model needs to call `.get_peft_model` first!"
            )
        pass

        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing = use_gradient_checkpointing,
            use_reentrant = True,
        )

        from transformers.trainer import Trainer
        if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
            raise RuntimeError(
                'Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so '\
                'enabling it will require much more work, so we have to prioritize. Please understand!\n'\
                'We do have a separate beta version, which you can contact us about!\n'\
                'Thank you for your understanding and we appreciate it immensely!'
            )
        pass
        patch_saving_functions(model, vision = True)

        internal_model = model
        while hasattr(internal_model, "model"):
            if hasattr(internal_model, "_saved_temp_tokenizer"):
                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
            pass
            internal_model = internal_model.model
        pass
        if hasattr(internal_model, "_saved_temp_tokenizer"):
            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
        pass

        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()
        pass
        return model
    pass
|
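    # Prepares the model for generation: disables gradient checkpointing and
    # training flags, wraps `generate` with the fast inference path, and switches
    # saved tokenizers to left padding.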
    @staticmethod
    def for_inference(model):
        model.gradient_checkpointing = False
        model.training = False

        for name, module in model.named_modules():
            if hasattr(module, "gradient_checkpointing"):
                module.gradient_checkpointing = False
            if hasattr(module, "training"):
                module.training = False
        pass

        dtype = model.config.torch_dtype
        if type(dtype) is str:
            if dtype == "float16": dtype = torch.float16
            elif dtype == "bfloat16": dtype = torch.bfloat16
        pass
        device_type = model.device.type

        # Wrap model.generate with the fast inference path only once
        if model.generate.__name__ != "_fast_generate":
            model._unwrapped_old_generate = model.generate
            model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
        pass

        # Generation requires left padding
        internal_model = model
        while hasattr(internal_model, "model"):
            if hasattr(internal_model, "_saved_temp_tokenizer"):
                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
            pass
            internal_model = internal_model.model
        pass
        if hasattr(internal_model, "_saved_temp_tokenizer"):
            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
        pass

        # Also turn off training mode on the embedding layers
        if hasattr(model, "get_input_embeddings"):
            embeddings = model.get_input_embeddings()
            if hasattr(embeddings, "training"): embeddings.training = False
        pass
        if hasattr(model, "get_output_embeddings"):
            embeddings = model.get_output_embeddings()
            if hasattr(embeddings, "training"): embeddings.training = False
        pass

        return model
    pass
|
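    # Reverts the inference-time setup: restores the original `generate`,
    # re-enables gradient checkpointing, and switches back to right padding.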
    @staticmethod
    def for_training(model, use_gradient_checkpointing = True):
        model.gradient_checkpointing = use_gradient_checkpointing
        model.training = True

        for name, module in model.named_modules():
            if hasattr(module, "gradient_checkpointing"):
                module.gradient_checkpointing = use_gradient_checkpointing
            if hasattr(module, "training"):
                module.training = True
        pass

        # Restore the original, unwrapped generate
        if hasattr(model, "_unwrapped_old_generate"):
            model.generate = model._unwrapped_old_generate
            del model._unwrapped_old_generate
        pass

        # Training requires right padding
        internal_model = model
        while hasattr(internal_model, "model"):
            if hasattr(internal_model, "_saved_temp_tokenizer"):
                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
            pass
            internal_model = internal_model.model
        pass
        if hasattr(internal_model, "_saved_temp_tokenizer"):
            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
        pass

        # Also turn training mode back on for the embedding layers
        if hasattr(model, "get_input_embeddings"):
            embeddings = model.get_input_embeddings()
            if hasattr(embeddings, "training"): embeddings.training = True
        pass
        if hasattr(model, "get_output_embeddings"):
            embeddings = model.get_output_embeddings()
            if hasattr(embeddings, "training"): embeddings.training = True
        pass

        return model
    pass
pass
|
|
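# Example usage (illustrative sketch only - the checkpoint name and model type
# below are placeholders, not a tested recipe):
#
#   model, tokenizer = FastBaseVisionModel.from_pretrained(
#       model_name   = "<vision checkpoint>",
#       load_in_4bit = True,
#       model_types  = ["<model type>"],
#   )
#   model = FastBaseVisionModel.get_peft_model(
#       model,
#       r = 16,
#       lora_alpha = 16,
#       finetune_vision_layers   = True,
#       finetune_language_layers = True,
#   )
#   model = FastBaseVisionModel.for_training(model)   # right padding, checkpointing on
#   # ... run the trainer ...
#   model = FastBaseVisionModel.for_inference(model)  # left padding, fast generate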