[WARNING|2024-11-26 00:32:28] logging.py:162 >> We recommend enabling `upcast_layernorm` for quantized training.

[INFO|2024-11-26 00:32:28] parser.py:355 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16

[INFO|2024-11-26 00:32:28] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/config.json

[INFO|2024-11-26 00:32:28] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 256000
}
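
The Gemma2Config dump above comes straight from the cached config.json. A minimal sketch of reproducing it outside the trainer, assuming only `transformers` 4.46.x is installed (Gemma-2 is a gated repo, so a logged-in Hugging Face token may be required):

```python
from transformers import AutoConfig

# Resolves the same config.json that the log loads from the local HF cache.
config = AutoConfig.from_pretrained("google/gemma-2-9b-it")
print(config)  # prints the Gemma2Config shown above
```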

[INFO|2024-11-26 00:32:28] tokenization_utils_base.py:2211 >> loading file tokenizer.model from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer.model

[INFO|2024-11-26 00:32:28] tokenization_utils_base.py:2211 >> loading file tokenizer.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer.json

[INFO|2024-11-26 00:32:28] tokenization_utils_base.py:2211 >> loading file added_tokens.json from cache at None

[INFO|2024-11-26 00:32:28] tokenization_utils_base.py:2211 >> loading file special_tokens_map.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/special_tokens_map.json

[INFO|2024-11-26 00:32:28] tokenization_utils_base.py:2211 >> loading file tokenizer_config.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer_config.json
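
The tokenizer files listed above are what `AutoTokenizer` resolves for this model. A minimal sketch of loading the tokenizer and rendering a user turn with Gemma-2's built-in chat template (the example message is hypothetical):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")

# Gemma-2 ships a chat template; fine-tuning frameworks typically render
# each conversation through it before tokenization.
messages = [{"role": "user", "content": "Olá, tudo bem?"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
```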

[INFO|2024-11-26 00:32:30] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/config.json

[INFO|2024-11-26 00:32:30] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 256000
}

[INFO|2024-11-26 00:32:30] tokenization_utils_base.py:2211 >> loading file tokenizer.model from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer.model

[INFO|2024-11-26 00:32:30] tokenization_utils_base.py:2211 >> loading file tokenizer.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer.json

[INFO|2024-11-26 00:32:30] tokenization_utils_base.py:2211 >> loading file added_tokens.json from cache at None

[INFO|2024-11-26 00:32:30] tokenization_utils_base.py:2211 >> loading file special_tokens_map.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/special_tokens_map.json

[INFO|2024-11-26 00:32:30] tokenization_utils_base.py:2211 >> loading file tokenizer_config.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/tokenizer_config.json

[INFO|2024-11-26 00:32:31] logging.py:157 >> Loading dataset treino_pt_rde.json...
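
`treino_pt_rde.json` is a local JSON dataset registered with the training framework; its schema is not shown in the log. A minimal sketch for inspecting such a file with the `datasets` library (the column layout printed will depend entirely on how the file was built):

```python
from datasets import load_dataset

# Load the local JSON file as a single "train" split.
dataset = load_dataset("json", data_files="treino_pt_rde.json", split="train")
print(dataset)      # number of rows and column names
print(dataset[0])   # first example; field names depend on the dataset format used
```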

[INFO|2024-11-26 00:32:44] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/config.json

[INFO|2024-11-26 00:32:44] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 256000
}

[WARNING|2024-11-26 00:32:44] logging.py:162 >> Gemma-2 should use flash attention 2, change `flash_attn` to fa2.

[INFO|2024-11-26 00:32:44] logging.py:157 >> Quantizing model to 4 bit with bitsandbytes.

[INFO|2024-11-26 00:32:45] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/config.json

[INFO|2024-11-26 00:32:45] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it-bnb-4bit",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "unsloth_version": "2024.9",
  "use_cache": true,
  "vocab_size": 256000
}
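
The `quantization_config` block above describes a standard bitsandbytes NF4 setup with double quantization and bfloat16 compute. A sketch of expressing the same settings through `transformers`, shown for reference rather than as the framework's exact call:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # matches "bnb_4bit_quant_type": "nf4"
    bnb_4bit_use_double_quant=True,        # matches "bnb_4bit_use_double_quant": true
    bnb_4bit_compute_dtype=torch.bfloat16, # matches "bnb_4bit_compute_dtype": "bfloat16"
)

model = AutoModelForCausalLM.from_pretrained(
    "unsloth/gemma-2-9b-it-bnb-4bit",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```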

[INFO|2024-11-26 00:32:45] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unslothai--aws/snapshots/66e4c14a24a0b445779c922eef992a4af0694a88/config.json

[INFO|2024-11-26 00:32:45] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json

[INFO|2024-11-26 00:32:46] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unslothai--vram-16/snapshots/9703344699da71a2bb9f17e575eb918c8f6cb349/config.json

[INFO|2024-11-26 00:32:46] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/config.json

[INFO|2024-11-26 00:32:46] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/config.json

[INFO|2024-11-26 00:32:46] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it-bnb-4bit",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "unsloth_version": "2024.9",
  "use_cache": true,
  "vocab_size": 256000
}

[INFO|2024-11-26 00:32:46] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/config.json

[INFO|2024-11-26 00:32:46] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it-bnb-4bit",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "float16",
  "transformers_version": "4.46.1",
  "unsloth_version": "2024.9",
  "use_cache": true,
  "vocab_size": 256000
}

[INFO|2024-11-26 00:45:10] modeling_utils.py:3937 >> loading weights file model.safetensors from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/model.safetensors

[INFO|2024-11-26 00:45:10] modeling_utils.py:1670 >> Instantiating Gemma2ForCausalLM model under default dtype torch.float16.

[INFO|2024-11-26 00:45:10] configuration_utils.py:1096 >> Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0
}

[INFO|2024-11-26 00:45:33] modeling_utils.py:4800 >> All model checkpoint weights were used when initializing Gemma2ForCausalLM.

[INFO|2024-11-26 00:45:33] modeling_utils.py:4808 >> All the weights of Gemma2ForCausalLM were initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Gemma2ForCausalLM for predictions without further training.

[INFO|2024-11-26 00:45:33] configuration_utils.py:1051 >> loading configuration file generation_config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/generation_config.json

[INFO|2024-11-26 00:45:33] configuration_utils.py:1096 >> Generate config GenerationConfig {
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "max_length": 8192,
  "pad_token_id": 0
}

[INFO|2024-11-26 00:45:40] logging.py:157 >> Gradient checkpointing enabled.

[INFO|2024-11-26 00:45:40] logging.py:157 >> Upcasting trainable params to float32.
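
The two steps above (gradient checkpointing plus upcasting trainable parameters to float32) are the usual preparation for training on a 4-bit quantized base. `peft` bundles an equivalent preparation in a single helper; a minimal sketch, not necessarily the exact code path this framework takes:

```python
from peft import prepare_model_for_kbit_training

# Enables gradient checkpointing and casts the small trainable/norm parameters
# to float32 on a k-bit quantized model; `model` here is the 4-bit base loaded earlier.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
```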

[INFO|2024-11-26 00:45:40] logging.py:157 >> Fine-tuning method: LoRA

[INFO|2024-11-26 00:45:40] logging.py:157 >> Found linear modules: k_proj,v_proj,gate_proj,q_proj,down_proj,o_proj,up_proj
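
Those seven projection names are the linear layers of a Gemma-2 decoder block, i.e. the standard "all linear" LoRA targets. A sketch of the equivalent `peft` configuration; the rank, alpha and dropout values here are assumptions (the rank is consistent with the trainable-parameter count reported below), not values printed in the log:

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,               # assumed rank; see the parameter-count check below
    lora_alpha=16,     # assumed
    lora_dropout=0.0,  # assumed
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```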

[WARNING|2024-11-26 00:45:42] logging.py:168 >> Unsloth 2024.11.9 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.

[INFO|2024-11-26 00:45:44] logging.py:157 >> trainable params: 27,009,024 || all params: 9,268,715,008 || trainable%: 0.2914
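
The 27,009,024 trainable parameters are consistent with rank-8 LoRA adapters on all seven projections of all 42 layers. A quick check using the dimensions from the config above (hidden_size 3584, head_dim 256, 16 query heads, 8 KV heads, intermediate_size 14336):

```python
# (in_features, out_features) of each adapted projection in a Gemma-2-9B layer:
# q_proj 3584->4096, k_proj 3584->2048, v_proj 3584->2048, o_proj 4096->3584,
# gate_proj 3584->14336, up_proj 3584->14336, down_proj 14336->3584
dims = [(3584, 4096), (3584, 2048), (3584, 2048), (4096, 3584),
        (3584, 14336), (3584, 14336), (14336, 3584)]
r = 8  # assumed LoRA rank
per_layer = sum(r * (fan_in + fan_out) for fan_in, fan_out in dims)  # A is (r x in), B is (out x r)
print(per_layer * 42)  # 27,009,024, matching the log
```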

[INFO|2024-11-26 00:45:44] trainer.py:698 >> Using auto half precision backend

[WARNING|2024-11-26 00:45:44] <string>:208 >> ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,716 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 348
 "-____-"     Number of trainable parameters = 27,009,024
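
The banner's totals follow from the run setup: 3,716 examples at a per-device batch size of 8 give 465 batches per epoch, 465 // 4 accumulation steps give 116 optimizer steps per epoch, and 3 epochs give 348 total steps at an effective batch size of 8 × 4 = 32. A hedged sketch of matching `TrainingArguments`; the learning rate, scheduler and logging interval are inferred from the loss lines below (a cosine decay from 3e-5 logged every 10 steps fits the printed values) and may differ from the actual run configuration:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="saves/Gemma-2-9B-Instruct/lora/gemma2-9b-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,   # effective batch size 8 * 4 = 32
    num_train_epochs=3,              # 116 optimizer steps/epoch -> 348 total steps
    learning_rate=3e-5,              # assumed; first logged value is 2.9939e-05
    lr_scheduler_type="cosine",      # assumed from the decay shape of the logged LR
    fp16=True,                       # compute dtype reported as torch.float16
    logging_steps=10,                # 34 loss lines over 348 steps
)
```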

[INFO|2024-11-26 00:51:16] logging.py:157 >> {'loss': 0.5356, 'learning_rate': 2.9939e-05, 'epoch': 0.09}
[INFO|2024-11-26 00:56:24] logging.py:157 >> {'loss': 0.1187, 'learning_rate': 2.9756e-05, 'epoch': 0.17}
[INFO|2024-11-26 01:01:29] logging.py:157 >> {'loss': 0.1068, 'learning_rate': 2.9453e-05, 'epoch': 0.26}
[INFO|2024-11-26 01:06:35] logging.py:157 >> {'loss': 0.1000, 'learning_rate': 2.9033e-05, 'epoch': 0.34}
[INFO|2024-11-26 01:11:39] logging.py:157 >> {'loss': 0.0900, 'learning_rate': 2.8498e-05, 'epoch': 0.43}
[INFO|2024-11-26 01:16:47] logging.py:157 >> {'loss': 0.0831, 'learning_rate': 2.7853e-05, 'epoch': 0.52}
[INFO|2024-11-26 01:21:57] logging.py:157 >> {'loss': 0.0802, 'learning_rate': 2.7103e-05, 'epoch': 0.60}
[INFO|2024-11-26 01:27:07] logging.py:157 >> {'loss': 0.0719, 'learning_rate': 2.6255e-05, 'epoch': 0.69}
[INFO|2024-11-26 01:32:21] logging.py:157 >> {'loss': 0.0744, 'learning_rate': 2.5315e-05, 'epoch': 0.77}
[INFO|2024-11-26 01:37:33] logging.py:157 >> {'loss': 0.0696, 'learning_rate': 2.4292e-05, 'epoch': 0.86}
[INFO|2024-11-26 01:42:35] logging.py:157 >> {'loss': 0.0636, 'learning_rate': 2.3192e-05, 'epoch': 0.95}
[INFO|2024-11-26 01:47:47] logging.py:157 >> {'loss': 0.0716, 'learning_rate': 2.2026e-05, 'epoch': 1.03}
[INFO|2024-11-26 01:52:57] logging.py:157 >> {'loss': 0.0530, 'learning_rate': 2.0803e-05, 'epoch': 1.12}
[INFO|2024-11-26 01:58:06] logging.py:157 >> {'loss': 0.0544, 'learning_rate': 1.9532e-05, 'epoch': 1.20}
[INFO|2024-11-26 02:03:09] logging.py:157 >> {'loss': 0.0570, 'learning_rate': 1.8225e-05, 'epoch': 1.29}
[INFO|2024-11-26 02:08:17] logging.py:157 >> {'loss': 0.0609, 'learning_rate': 1.6891e-05, 'epoch': 1.38}
[INFO|2024-11-26 02:13:25] logging.py:157 >> {'loss': 0.0603, 'learning_rate': 1.5542e-05, 'epoch': 1.46}
[INFO|2024-11-26 02:18:35] logging.py:157 >> {'loss': 0.0574, 'learning_rate': 1.4188e-05, 'epoch': 1.55}
[INFO|2024-11-26 02:23:39] logging.py:157 >> {'loss': 0.0533, 'learning_rate': 1.2841e-05, 'epoch': 1.63}
[INFO|2024-11-26 02:28:45] logging.py:157 >> {'loss': 0.0546, 'learning_rate': 1.1511e-05, 'epoch': 1.72}
[INFO|2024-11-26 02:33:57] logging.py:157 >> {'loss': 0.0555, 'learning_rate': 1.0210e-05, 'epoch': 1.81}
[INFO|2024-11-26 02:39:06] logging.py:157 >> {'loss': 0.0552, 'learning_rate': 8.9485e-06, 'epoch': 1.89}
[INFO|2024-11-26 02:44:11] logging.py:157 >> {'loss': 0.0527, 'learning_rate': 7.7358e-06, 'epoch': 1.98}
[INFO|2024-11-26 02:49:14] logging.py:157 >> {'loss': 0.0476, 'learning_rate': 6.5822e-06, 'epoch': 2.06}
[INFO|2024-11-26 02:54:22] logging.py:157 >> {'loss': 0.0501, 'learning_rate': 5.4972e-06, 'epoch': 2.15}
[INFO|2024-11-26 02:59:33] logging.py:157 >> {'loss': 0.0524, 'learning_rate': 4.4896e-06, 'epoch': 2.24}
[INFO|2024-11-26 03:04:37] logging.py:157 >> {'loss': 0.0384, 'learning_rate': 3.5676e-06, 'epoch': 2.32}
[INFO|2024-11-26 03:09:52] logging.py:157 >> {'loss': 0.0424, 'learning_rate': 2.7387e-06, 'epoch': 2.41}
[INFO|2024-11-26 03:14:57] logging.py:157 >> {'loss': 0.0480, 'learning_rate': 2.0096e-06, 'epoch': 2.49}
[INFO|2024-11-26 03:20:06] logging.py:157 >> {'loss': 0.0402, 'learning_rate': 1.3864e-06, 'epoch': 2.58}
[INFO|2024-11-26 03:25:13] logging.py:157 >> {'loss': 0.0430, 'learning_rate': 8.7399e-07, 'epoch': 2.67}
[INFO|2024-11-26 03:30:18] logging.py:157 >> {'loss': 0.0466, 'learning_rate': 4.7666e-07, 'epoch': 2.75}
[INFO|2024-11-26 03:35:24] logging.py:157 >> {'loss': 0.0474, 'learning_rate': 1.9760e-07, 'epoch': 2.84}
[INFO|2024-11-26 03:40:30] logging.py:157 >> {'loss': 0.0462, 'learning_rate': 3.9102e-08, 'epoch': 2.92}
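
The per-step dictionaries above are plain Python literals, so the training curve can be recovered directly from this log (the Trainer also keeps the same history as `log_history` in the checkpoint's trainer_state.json). A small sketch; the log filename is hypothetical:

```python
import ast
import re

losses, lrs, epochs = [], [], []
with open("train.log", encoding="utf-8") as f:          # hypothetical path to this log
    for line in f:
        m = re.search(r">> (\{'loss'.*\})\s*$", line)
        if m:
            record = ast.literal_eval(m.group(1))        # e.g. {'loss': 0.0462, ...}
            losses.append(record["loss"])
            lrs.append(record["learning_rate"])
            epochs.append(record["epoch"])

print(f"{len(losses)} logged steps, final loss {losses[-1]:.4f}")
```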

[INFO|2024-11-26 03:44:36] trainer.py:3801 >> Saving model checkpoint to saves/Gemma-2-9B-Instruct/lora/gemma2-9b-finetuned/checkpoint-348

[INFO|2024-11-26 03:44:36] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/config.json

[INFO|2024-11-26 03:44:36] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "unsloth_version": "2024.9",
  "use_cache": true,
  "vocab_size": 256000
}

[INFO|2024-11-26 03:44:38] <string>:484 >>

Training completed. Do not forget to share your model on huggingface.co/models =)

[INFO|2024-11-26 03:44:38] trainer.py:3801 >> Saving model checkpoint to saves/Gemma-2-9B-Instruct/lora/gemma2-9b-finetuned

[INFO|2024-11-26 03:44:38] configuration_utils.py:679 >> loading configuration file config.json from cache at /home/zeus/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/27b027bcbb6b1861b02551d5c699d5d07f29610a/config.json

[INFO|2024-11-26 03:44:38] configuration_utils.py:746 >> Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "sliding_window_size": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "unsloth_version": "2024.9",
  "use_cache": true,
  "vocab_size": 256000
}

[WARNING|2024-11-26 03:44:38] logging.py:162 >> No metric eval_loss to plot.

[WARNING|2024-11-26 03:44:38] logging.py:162 >> No metric eval_accuracy to plot.

[INFO|2024-11-26 03:44:38] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
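
With the adapter written to saves/Gemma-2-9B-Instruct/lora/gemma2-9b-finetuned, it can be attached to the same 4-bit base for inference. A minimal sketch, assuming `peft`, `bitsandbytes` and `accelerate` are installed and the adapter directory is available locally (the prompt is hypothetical):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "unsloth/gemma-2-9b-it-bnb-4bit", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-2-9b-it-bnb-4bit")

# Attach the LoRA adapter produced by this run.
model = PeftModel.from_pretrained(base, "saves/Gemma-2-9B-Instruct/lora/gemma2-9b-finetuned")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Olá!"}], tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```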