[INFO|2025-01-08 22:07:07] parser.py:359 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2025-01-08 22:07:07] configuration_utils.py:677 >> loading configuration file /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa/config.json
[INFO|2025-01-08 22:07:07] configuration_utils.py:746 >> Model config GraniteConfig {
  "_name_or_path": "/root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa",
  "architectures": [
    "GraniteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attention_multiplier": 0.015625,
  "bos_token_id": 0,
  "embedding_multiplier": 12.0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "logits_scaling": 8.0,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "granite",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "residual_multiplier": 0.22,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 49216
}
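
For reference, the config dump above can be reproduced outside LLaMA-Factory with the standard transformers loaders. This is a minimal sketch, assuming the same local checkpoint path and the transformers 4.46.1 version reported in the log; it is not part of the training run itself.

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "/root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa"

# Reads config.json and prints the same GraniteConfig shown in the log.
config = AutoConfig.from_pretrained(MODEL_DIR)
print(config)

# Tokenizer and weights, loaded in bfloat16 to match the run's compute dtype.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, torch_dtype=torch.bfloat16)
```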
[INFO|2025-01-08 22:07:07] parser.py:359 >> Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file vocab.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file merges.txt
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file tokenizer.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file added_tokens.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file special_tokens_map.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file tokenizer_config.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2475 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-01-08 22:07:07] configuration_utils.py:677 >> loading configuration file /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa/config.json
[INFO|2025-01-08 22:07:07] configuration_utils.py:746 >> Model config GraniteConfig {
  "_name_or_path": "/root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa",
  "architectures": [
    "GraniteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attention_multiplier": 0.015625,
  "bos_token_id": 0,
  "embedding_multiplier": 12.0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "logits_scaling": 8.0,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "granite",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "residual_multiplier": 0.22,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 49216
}
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file vocab.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file merges.txt
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file tokenizer.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file added_tokens.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file special_tokens_map.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2209 >> loading file tokenizer_config.json
[INFO|2025-01-08 22:07:07] tokenization_utils_base.py:2475 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-01-08 22:07:07] logging.py:157 >> Add <|end_of_text|> to stop words.
[WARNING|2025-01-08 22:07:07] logging.py:162 >> New tokens have been added, make sure `resize_vocab` is True.
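
The warning above refers to LLaMA-Factory's `resize_vocab` option: because new tokens were added to the tokenizer, the embedding matrix should be grown to cover them. A rough sketch of the underlying transformers call, continuing the `model`/`tokenizer` objects from the loading sketch above:

```python
# Grow the input embedding (and, since tie_word_embeddings is true, the tied
# output head) to the tokenizer's new vocabulary size if it changed.
new_size = len(tokenizer)
if new_size != model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(new_size)
```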
[INFO|2025-01-08 22:07:07] logging.py:157 >> Loading dataset /root/LLaMA-Factory/data/bluryar/blur/stage1_sampled_data.jsonl...
[INFO|2025-01-08 22:07:09] logging.py:157 >> Loading dataset /root/LLaMA-Factory/data/bluryar/blur/stage2/hqfx_function_calling_sft.jsonl...
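
The two JSONL files named above can be inspected directly with the `datasets` library. This is only a sanity-check sketch; the column layout (alpaca- vs sharegpt-style) depends on how the files are registered in LLaMA-Factory's dataset_info.json, which this log does not show.

```python
from datasets import load_dataset

files = [
    "/root/LLaMA-Factory/data/bluryar/blur/stage1_sampled_data.jsonl",
    "/root/LLaMA-Factory/data/bluryar/blur/stage2/hqfx_function_calling_sft.jsonl",
]
for path in files:
    # Load each raw JSONL file and report its size and columns.
    ds = load_dataset("json", data_files=path, split="train")
    print(path, len(ds), ds.column_names)
```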
[INFO|2025-01-08 22:07:32] configuration_utils.py:677 >> loading configuration file /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa/config.json
[INFO|2025-01-08 22:07:32] configuration_utils.py:746 >> Model config GraniteConfig {
  "_name_or_path": "/root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa",
  "architectures": [
    "GraniteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attention_multiplier": 0.015625,
  "bos_token_id": 0,
  "embedding_multiplier": 12.0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "logits_scaling": 8.0,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "granite",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "residual_multiplier": 0.22,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 49216
}
[INFO|2025-01-08 22:07:32] modeling_utils.py:3934 >> loading weights file /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa/model.safetensors.index.json
[INFO|2025-01-08 22:07:32] modeling_utils.py:1670 >> Instantiating GraniteForCausalLM model under default dtype torch.bfloat16.
[INFO|2025-01-08 22:07:32] configuration_utils.py:1096 >> Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "pad_token_id": 0,
  "use_cache": false
}
[INFO|2025-01-08 22:07:34] modeling_utils.py:4800 >> All model checkpoint weights were used when initializing GraniteForCausalLM.
[INFO|2025-01-08 22:07:34] modeling_utils.py:4808 >> All the weights of GraniteForCausalLM were initialized from the model checkpoint at /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GraniteForCausalLM for predictions without further training.
[INFO|2025-01-08 22:07:34] configuration_utils.py:1049 >> loading configuration file /root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa/generation_config.json
[INFO|2025-01-08 22:07:34] configuration_utils.py:1096 >> Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "pad_token_id": 0
}
[INFO|2025-01-08 22:07:34] logging.py:157 >> Gradient checkpointing enabled.
[INFO|2025-01-08 22:07:34] logging.py:157 >> Using FlashAttention-2 for faster training and inference.
[INFO|2025-01-08 22:07:34] logging.py:157 >> Upcasting trainable params to float32.
[INFO|2025-01-08 22:07:34] logging.py:157 >> Fine-tuning method: Full
[INFO|2025-01-08 22:07:34] logging.py:157 >> trainable params: 2,533,656,576 || all params: 2,533,656,576 || trainable%: 100.0000
[INFO|2025-01-08 22:07:34] trainer.py:698 >> Using auto half precision backend
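
The "trainable params" line confirms full fine-tuning: all 2,533,656,576 parameters require gradients. A hedged, standalone sketch of how the setup lines above map onto plain transformers calls (LLaMA-Factory performs the equivalent steps internally; this is not its actual code path):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "/root/LLaMA-Factory/models/bluryar/granite-3.1-2b-qa",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # "Using FlashAttention-2 ..."
)
model.gradient_checkpointing_enable()          # "Gradient checkpointing enabled."

# Reproduce the trainable-parameter summary printed in the log.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {100 * trainable / total:.4f}")
```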
[INFO|2025-01-08 22:08:07] trainer.py:2313 >> ***** Running training *****
[INFO|2025-01-08 22:08:07] trainer.py:2314 >> Num examples = 48,185
[INFO|2025-01-08 22:08:07] trainer.py:2315 >> Num Epochs = 1
[INFO|2025-01-08 22:08:07] trainer.py:2316 >> Instantaneous batch size per device = 2
[INFO|2025-01-08 22:08:07] trainer.py:2319 >> Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|2025-01-08 22:08:07] trainer.py:2320 >> Gradient Accumulation steps = 8
[INFO|2025-01-08 22:08:07] trainer.py:2321 >> Total optimization steps = 753
[INFO|2025-01-08 22:08:07] trainer.py:2322 >> Number of trainable parameters = 2,533,656,576
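
These header numbers are internally consistent: a per-device batch of 2 with 8 accumulation steps and a total batch of 64 implies 4 data-parallel processes (matching the per-rank parser.py lines at the top of the log), and one epoch over 48,185 examples at an effective batch of 64 gives ceil(48185 / 64) = 753 optimization steps, as reported.

```python
import math

per_device_batch = 2     # "Instantaneous batch size per device"
grad_accum = 8           # "Gradient Accumulation steps"
total_batch = 64         # "Total train batch size"
num_examples = 48_185    # "Num examples"

world_size = total_batch // (per_device_batch * grad_accum)  # -> 4 processes
steps_per_epoch = math.ceil(num_examples / total_batch)      # -> 753 steps
print(world_size, steps_per_epoch)
```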
[INFO|2025-01-08 22:09:31] logging.py:157 >> {'loss': 0.8161, 'learning_rate': 3.3333e-07, 'epoch': 0.01}
[INFO|2025-01-08 22:10:53] logging.py:157 >> {'loss': 0.8023, 'learning_rate': 6.6667e-07, 'epoch': 0.01}
[INFO|2025-01-08 22:12:19] logging.py:157 >> {'loss': 0.8767, 'learning_rate': 1.0000e-06, 'epoch': 0.02}
[INFO|2025-01-08 22:13:48] logging.py:157 >> {'loss': 0.7722, 'learning_rate': 1.3333e-06, 'epoch': 0.03}
[INFO|2025-01-08 22:15:26] logging.py:157 >> {'loss': 0.6902, 'learning_rate': 1.6667e-06, 'epoch': 0.03}
[INFO|2025-01-08 22:16:52] logging.py:157 >> {'loss': 0.7115, 'learning_rate': 2.0000e-06, 'epoch': 0.04}
[INFO|2025-01-08 22:18:21] logging.py:157 >> {'loss': 0.6591, 'learning_rate': 2.3333e-06, 'epoch': 0.05}
[INFO|2025-01-08 22:19:37] logging.py:157 >> {'loss': 0.6490, 'learning_rate': 2.6667e-06, 'epoch': 0.05}
[INFO|2025-01-08 22:21:02] logging.py:157 >> {'loss': 0.6547, 'learning_rate': 3.0000e-06, 'epoch': 0.06}
[INFO|2025-01-08 22:22:30] logging.py:157 >> {'loss': 0.6395, 'learning_rate': 3.3333e-06, 'epoch': 0.07}
[INFO|2025-01-08 22:23:55] logging.py:157 >> {'loss': 0.6091, 'learning_rate': 3.6667e-06, 'epoch': 0.07}
[INFO|2025-01-08 22:25:23] logging.py:157 >> {'loss': 0.6551, 'learning_rate': 4.0000e-06, 'epoch': 0.08}
[INFO|2025-01-08 22:26:46] logging.py:157 >> {'loss': 0.6516, 'learning_rate': 4.3333e-06, 'epoch': 0.09}
[INFO|2025-01-08 22:27:57] logging.py:157 >> {'loss': 0.6530, 'learning_rate': 4.6667e-06, 'epoch': 0.09}
[INFO|2025-01-08 22:29:22] logging.py:157 >> {'loss': 0.6698, 'learning_rate': 5.0000e-06, 'epoch': 0.10}
[INFO|2025-01-08 22:30:49] logging.py:157 >> {'loss': 0.6120, 'learning_rate': 5.3333e-06, 'epoch': 0.11}
[INFO|2025-01-08 22:32:12] logging.py:157 >> {'loss': 0.6104, 'learning_rate': 5.6667e-06, 'epoch': 0.11}
[INFO|2025-01-08 22:33:43] logging.py:157 >> {'loss': 0.6090, 'learning_rate': 6.0000e-06, 'epoch': 0.12}
[INFO|2025-01-08 22:35:04] logging.py:157 >> {'loss': 0.6171, 'learning_rate': 6.3333e-06, 'epoch': 0.13}
[INFO|2025-01-08 22:36:25] logging.py:157 >> {'loss': 0.6189, 'learning_rate': 6.6667e-06, 'epoch': 0.13}
[INFO|2025-01-08 22:37:46] logging.py:157 >> {'loss': 0.6014, 'learning_rate': 7.0000e-06, 'epoch': 0.14}
[INFO|2025-01-08 22:39:11] logging.py:157 >> {'loss': 0.6451, 'learning_rate': 7.3333e-06, 'epoch': 0.15}
[INFO|2025-01-08 22:40:32] logging.py:157 >> {'loss': 0.5916, 'learning_rate': 7.6667e-06, 'epoch': 0.15}
[INFO|2025-01-08 22:41:55] logging.py:157 >> {'loss': 0.5904, 'learning_rate': 8.0000e-06, 'epoch': 0.16}
[INFO|2025-01-08 22:43:33] logging.py:157 >> {'loss': 0.5848, 'learning_rate': 8.3333e-06, 'epoch': 0.17}
[INFO|2025-01-08 22:44:57] logging.py:157 >> {'loss': 0.6029, 'learning_rate': 8.6667e-06, 'epoch': 0.17}
[INFO|2025-01-08 22:46:14] logging.py:157 >> {'loss': 0.6011, 'learning_rate': 9.0000e-06, 'epoch': 0.18}
[INFO|2025-01-08 22:47:39] logging.py:157 >> {'loss': 0.5963, 'learning_rate': 9.3333e-06, 'epoch': 0.19}
[INFO|2025-01-08 22:48:55] logging.py:157 >> {'loss': 0.6348, 'learning_rate': 9.6667e-06, 'epoch': 0.19}
[INFO|2025-01-08 22:50:15] logging.py:157 >> {'loss': 0.5913, 'learning_rate': 1.0000e-05, 'epoch': 0.20}
[INFO|2025-01-08 22:51:43] logging.py:157 >> {'loss': 0.6214, 'learning_rate': 9.9983e-06, 'epoch': 0.21}
[INFO|2025-01-08 22:53:22] logging.py:157 >> {'loss': 0.6413, 'learning_rate': 9.9932e-06, 'epoch': 0.21}
[INFO|2025-01-08 22:54:36] logging.py:157 >> {'loss': 0.5808, 'learning_rate': 9.9847e-06, 'epoch': 0.22}
[INFO|2025-01-08 22:55:58] logging.py:157 >> {'loss': 0.6343, 'learning_rate': 9.9729e-06, 'epoch': 0.23}
[INFO|2025-01-08 22:57:18] logging.py:157 >> {'loss': 0.6134, 'learning_rate': 9.9576e-06, 'epoch': 0.23}
[INFO|2025-01-08 22:58:46] logging.py:157 >> {'loss': 0.5888, 'learning_rate': 9.9391e-06, 'epoch': 0.24}
[INFO|2025-01-08 23:00:18] logging.py:157 >> {'loss': 0.6621, 'learning_rate': 9.9171e-06, 'epoch': 0.25}
[INFO|2025-01-08 23:01:43] logging.py:157 >> {'loss': 0.6580, 'learning_rate': 9.8918e-06, 'epoch': 0.25}
[INFO|2025-01-08 23:02:54] logging.py:157 >> {'loss': 0.5960, 'learning_rate': 9.8632e-06, 'epoch': 0.26}
[INFO|2025-01-08 23:04:15] logging.py:157 >> {'loss': 0.6469, 'learning_rate': 9.8313e-06, 'epoch': 0.27}
[INFO|2025-01-08 23:05:42] logging.py:157 >> {'loss': 0.5828, 'learning_rate': 9.7961e-06, 'epoch': 0.27}
[INFO|2025-01-08 23:07:09] logging.py:157 >> {'loss': 0.6873, 'learning_rate': 9.7577e-06, 'epoch': 0.28}
[INFO|2025-01-08 23:08:42] logging.py:157 >> {'loss': 0.6477, 'learning_rate': 9.7160e-06, 'epoch': 0.29}
[INFO|2025-01-08 23:09:55] logging.py:157 >> {'loss': 0.5904, 'learning_rate': 9.6712e-06, 'epoch': 0.29}
[INFO|2025-01-08 23:11:23] logging.py:157 >> {'loss': 0.6119, 'learning_rate': 9.6231e-06, 'epoch': 0.30}
[INFO|2025-01-08 23:12:48] logging.py:157 >> {'loss': 0.6829, 'learning_rate': 9.5720e-06, 'epoch': 0.31}
[INFO|2025-01-08 23:14:11] logging.py:157 >> {'loss': 0.6048, 'learning_rate': 9.5177e-06, 'epoch': 0.31}
[INFO|2025-01-08 23:15:31] logging.py:157 >> {'loss': 0.5616, 'learning_rate': 9.4603e-06, 'epoch': 0.32}
[INFO|2025-01-08 23:16:56] logging.py:157 >> {'loss': 0.6373, 'learning_rate': 9.4000e-06, 'epoch': 0.33}
[INFO|2025-01-08 23:18:25] logging.py:157 >> {'loss': 0.6382, 'learning_rate': 9.3366e-06, 'epoch': 0.33}
[INFO|2025-01-08 23:19:46] logging.py:157 >> {'loss': 0.6601, 'learning_rate': 9.2703e-06, 'epoch': 0.34}
[INFO|2025-01-08 23:21:08] logging.py:157 >> {'loss': 0.5767, 'learning_rate': 9.2011e-06, 'epoch': 0.35}
[INFO|2025-01-08 23:22:35] logging.py:157 >> {'loss': 0.6323, 'learning_rate': 9.1291e-06, 'epoch': 0.35}
[INFO|2025-01-08 23:23:47] logging.py:157 >> {'loss': 0.6733, 'learning_rate': 9.0543e-06, 'epoch': 0.36}
[INFO|2025-01-08 23:25:11] logging.py:157 >> {'loss': 0.5903, 'learning_rate': 8.9767e-06, 'epoch': 0.37}
[INFO|2025-01-08 23:26:40] logging.py:157 >> {'loss': 0.6102, 'learning_rate': 8.8964e-06, 'epoch': 0.37}
[INFO|2025-01-08 23:27:54] logging.py:157 >> {'loss': 0.6564, 'learning_rate': 8.8134e-06, 'epoch': 0.38}
[INFO|2025-01-08 23:29:23] logging.py:157 >> {'loss': 0.6031, 'learning_rate': 8.7279e-06, 'epoch': 0.39}
[INFO|2025-01-08 23:30:43] logging.py:157 >> {'loss': 0.6093, 'learning_rate': 8.6398e-06, 'epoch': 0.39}
[INFO|2025-01-08 23:32:02] logging.py:157 >> {'loss': 0.6295, 'learning_rate': 8.5493e-06, 'epoch': 0.40}
[INFO|2025-01-08 23:33:35] logging.py:157 >> {'loss': 0.6299, 'learning_rate': 8.4564e-06, 'epoch': 0.41}
[INFO|2025-01-08 23:35:06] logging.py:157 >> {'loss': 0.6483, 'learning_rate': 8.3611e-06, 'epoch': 0.41}
[INFO|2025-01-08 23:36:23] logging.py:157 >> {'loss': 0.6620, 'learning_rate': 8.2636e-06, 'epoch': 0.42}
[INFO|2025-01-08 23:37:50] logging.py:157 >> {'loss': 0.5819, 'learning_rate': 8.1638e-06, 'epoch': 0.42}
[INFO|2025-01-08 23:39:14] logging.py:157 >> {'loss': 0.6009, 'learning_rate': 8.0619e-06, 'epoch': 0.43}
[INFO|2025-01-08 23:40:42] logging.py:157 >> {'loss': 0.5853, 'learning_rate': 7.9579e-06, 'epoch': 0.44}
[INFO|2025-01-08 23:42:14] logging.py:157 >> {'loss': 0.5848, 'learning_rate': 7.8519e-06, 'epoch': 0.44}
[INFO|2025-01-08 23:43:29] logging.py:157 >> {'loss': 0.5962, 'learning_rate': 7.7439e-06, 'epoch': 0.45}
[INFO|2025-01-08 23:44:56] logging.py:157 >> {'loss': 0.6368, 'learning_rate': 7.6341e-06, 'epoch': 0.46}
[INFO|2025-01-08 23:46:17] logging.py:157 >> {'loss': 0.6256, 'learning_rate': 7.5225e-06, 'epoch': 0.46}
[INFO|2025-01-08 23:47:44] logging.py:157 >> {'loss': 0.6009, 'learning_rate': 7.4092e-06, 'epoch': 0.47}
[INFO|2025-01-08 23:49:14] logging.py:157 >> {'loss': 0.6153, 'learning_rate': 7.2943e-06, 'epoch': 0.48}
[INFO|2025-01-08 23:50:37] logging.py:157 >> {'loss': 0.6381, 'learning_rate': 7.1778e-06, 'epoch': 0.48}
[INFO|2025-01-08 23:51:56] logging.py:157 >> {'loss': 0.5699, 'learning_rate': 7.0598e-06, 'epoch': 0.49}
[INFO|2025-01-08 23:53:17] logging.py:157 >> {'loss': 0.5915, 'learning_rate': 6.9405e-06, 'epoch': 0.50}
[INFO|2025-01-08 23:54:47] logging.py:157 >> {'loss': 0.5981, 'learning_rate': 6.8198e-06, 'epoch': 0.50}
[INFO|2025-01-08 23:56:08] logging.py:157 >> {'loss': 0.6397, 'learning_rate': 6.6979e-06, 'epoch': 0.51}
[INFO|2025-01-08 23:57:37] logging.py:157 >> {'loss': 0.6400, 'learning_rate': 6.5748e-06, 'epoch': 0.52}
[INFO|2025-01-08 23:59:02] logging.py:157 >> {'loss': 0.5730, 'learning_rate': 6.4506e-06, 'epoch': 0.52}
[INFO|2025-01-09 00:00:35] logging.py:157 >> {'loss': 0.6446, 'learning_rate': 6.3255e-06, 'epoch': 0.53}
[INFO|2025-01-09 00:02:12] logging.py:157 >> {'loss': 0.5955, 'learning_rate': 6.1995e-06, 'epoch': 0.54}
[INFO|2025-01-09 00:03:30] logging.py:157 >> {'loss': 0.6338, 'learning_rate': 6.0727e-06, 'epoch': 0.54}
[INFO|2025-01-09 00:04:59] logging.py:157 >> {'loss': 0.6026, 'learning_rate': 5.9451e-06, 'epoch': 0.55}
[INFO|2025-01-09 00:06:19] logging.py:157 >> {'loss': 0.5776, 'learning_rate': 5.8169e-06, 'epoch': 0.56}
[INFO|2025-01-09 00:07:46] logging.py:157 >> {'loss': 0.6135, 'learning_rate': 5.6881e-06, 'epoch': 0.56}
[INFO|2025-01-09 00:09:21] logging.py:157 >> {'loss': 0.7029, 'learning_rate': 5.5589e-06, 'epoch': 0.57}
[INFO|2025-01-09 00:10:44] logging.py:157 >> {'loss': 0.6328, 'learning_rate': 5.4293e-06, 'epoch': 0.58}
[INFO|2025-01-09 00:12:06] logging.py:157 >> {'loss': 0.5873, 'learning_rate': 5.2994e-06, 'epoch': 0.58}
[INFO|2025-01-09 00:13:37] logging.py:157 >> {'loss': 0.5935, 'learning_rate': 5.1693e-06, 'epoch': 0.59}
[INFO|2025-01-09 00:15:13] logging.py:157 >> {'loss': 0.6324, 'learning_rate': 5.0391e-06, 'epoch': 0.60}
[INFO|2025-01-09 00:16:40] logging.py:157 >> {'loss': 0.6311, 'learning_rate': 4.9088e-06, 'epoch': 0.60}
[INFO|2025-01-09 00:18:08] logging.py:157 >> {'loss': 0.5941, 'learning_rate': 4.7786e-06, 'epoch': 0.61}
[INFO|2025-01-09 00:19:35] logging.py:157 >> {'loss': 0.5964, 'learning_rate': 4.6486e-06, 'epoch': 0.62}
[INFO|2025-01-09 00:20:53] logging.py:157 >> {'loss': 0.5988, 'learning_rate': 4.5188e-06, 'epoch': 0.62}
[INFO|2025-01-09 00:22:17] logging.py:157 >> {'loss': 0.5662, 'learning_rate': 4.3894e-06, 'epoch': 0.63}
[INFO|2025-01-09 00:23:34] logging.py:157 >> {'loss': 0.6347, 'learning_rate': 4.2603e-06, 'epoch': 0.64}
[INFO|2025-01-09 00:25:13] logging.py:157 >> {'loss': 0.6139, 'learning_rate': 4.1318e-06, 'epoch': 0.64}
[INFO|2025-01-09 00:26:40] logging.py:157 >> {'loss': 0.5789, 'learning_rate': 4.0038e-06, 'epoch': 0.65}
[INFO|2025-01-09 00:28:12] logging.py:157 >> {'loss': 0.5833, 'learning_rate': 3.8765e-06, 'epoch': 0.66}
[INFO|2025-01-09 00:29:29] logging.py:157 >> {'loss': 0.5866, 'learning_rate': 3.7500e-06, 'epoch': 0.66}
[INFO|2025-01-09 00:29:29] trainer.py:4117 >>
***** Running Evaluation *****
[INFO|2025-01-09 00:29:29] trainer.py:4119 >> Num examples = 243
[INFO|2025-01-09 00:29:29] trainer.py:4122 >> Batch size = 2
[INFO|2025-01-09 00:29:44] trainer.py:3801 >> Saving model checkpoint to /root/autodl-tmp/saves/model/checkpoint-500
[INFO|2025-01-09 00:29:44] configuration_utils.py:414 >> Configuration saved in /root/autodl-tmp/saves/model/checkpoint-500/config.json
[INFO|2025-01-09 00:29:44] configuration_utils.py:865 >> Configuration saved in /root/autodl-tmp/saves/model/checkpoint-500/generation_config.json
[INFO|2025-01-09 00:29:48] modeling_utils.py:3043 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameter has been saved in the index located at /root/autodl-tmp/saves/model/checkpoint-500/model.safetensors.index.json.
[INFO|2025-01-09 00:29:48] tokenization_utils_base.py:2646 >> tokenizer config file saved in /root/autodl-tmp/saves/model/checkpoint-500/tokenizer_config.json
[INFO|2025-01-09 00:29:48] tokenization_utils_base.py:2655 >> Special tokens file saved in /root/autodl-tmp/saves/model/checkpoint-500/special_tokens_map.json
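
Note: checkpoint-500 is the mid-training checkpoint at step 500 of 753, i.e. roughly epoch 500/753 ≈ 0.66, which matches the epoch value on the loss lines immediately before and after the save. The evaluation directly above it appears to be triggered at the same interval (presumably save and eval steps of 500, which the log itself does not state).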
[INFO|2025-01-09 00:31:26] logging.py:157 >> {'loss': 0.6531, 'learning_rate': 3.6243e-06, 'epoch': 0.67}
[INFO|2025-01-09 00:32:50] logging.py:157 >> {'loss': 0.6374, 'learning_rate': 3.4996e-06, 'epoch': 0.68}
[INFO|2025-01-09 00:34:16] logging.py:157 >> {'loss': 0.6109, 'learning_rate': 3.3759e-06, 'epoch': 0.68}
[INFO|2025-01-09 00:35:36] logging.py:157 >> {'loss': 0.6006, 'learning_rate': 3.2532e-06, 'epoch': 0.69}
[INFO|2025-01-09 00:37:03] logging.py:157 >> {'loss': 0.6040, 'learning_rate': 3.1318e-06, 'epoch': 0.70}
[INFO|2025-01-09 00:38:22] logging.py:157 >> {'loss': 0.5694, 'learning_rate': 3.0116e-06, 'epoch': 0.70}
[INFO|2025-01-09 00:39:39] logging.py:157 >> {'loss': 0.6321, 'learning_rate': 2.8928e-06, 'epoch': 0.71}
[INFO|2025-01-09 00:41:14] logging.py:157 >> {'loss': 0.5404, 'learning_rate': 2.7754e-06, 'epoch': 0.72}
[INFO|2025-01-09 00:42:47] logging.py:157 >> {'loss': 0.5815, 'learning_rate': 2.6595e-06, 'epoch': 0.72}
[INFO|2025-01-09 00:44:07] logging.py:157 >> {'loss': 0.5839, 'learning_rate': 2.5453e-06, 'epoch': 0.73}
[INFO|2025-01-09 00:45:28] logging.py:157 >> {'loss': 0.6658, 'learning_rate': 2.4326e-06, 'epoch': 0.74}
[INFO|2025-01-09 00:46:51] logging.py:157 >> {'loss': 0.5837, 'learning_rate': 2.3217e-06, 'epoch': 0.74}
[INFO|2025-01-09 00:48:10] logging.py:157 >> {'loss': 0.6245, 'learning_rate': 2.2127e-06, 'epoch': 0.75}
[INFO|2025-01-09 00:49:36] logging.py:157 >> {'loss': 0.6048, 'learning_rate': 2.1055e-06, 'epoch': 0.76}
[INFO|2025-01-09 00:51:05] logging.py:157 >> {'loss': 0.6308, 'learning_rate': 2.0003e-06, 'epoch': 0.76}
[INFO|2025-01-09 00:52:30] logging.py:157 >> {'loss': 0.6318, 'learning_rate': 1.8971e-06, 'epoch': 0.77}
[INFO|2025-01-09 00:53:50] logging.py:157 >> {'loss': 0.6230, 'learning_rate': 1.7961e-06, 'epoch': 0.78}
[INFO|2025-01-09 00:55:06] logging.py:157 >> {'loss': 0.5935, 'learning_rate': 1.6972e-06, 'epoch': 0.78}
[INFO|2025-01-09 00:56:26] logging.py:157 >> {'loss': 0.5855, 'learning_rate': 1.6005e-06, 'epoch': 0.79}
[INFO|2025-01-09 00:57:47] logging.py:157 >> {'loss': 0.6031, 'learning_rate': 1.5062e-06, 'epoch': 0.80}
[INFO|2025-01-09 00:59:06] logging.py:157 >> {'loss': 0.6448, 'learning_rate': 1.4142e-06, 'epoch': 0.80}
[INFO|2025-01-09 01:00:36] logging.py:157 >> {'loss': 0.5545, 'learning_rate': 1.3246e-06, 'epoch': 0.81}
[INFO|2025-01-09 01:02:11] logging.py:157 >> {'loss': 0.6038, 'learning_rate': 1.2376e-06, 'epoch': 0.82}
[INFO|2025-01-09 01:03:43] logging.py:157 >> {'loss': 0.5847, 'learning_rate': 1.1531e-06, 'epoch': 0.82}
[INFO|2025-01-09 01:05:10] logging.py:157 >> {'loss': 0.6027, 'learning_rate': 1.0712e-06, 'epoch': 0.83}
[INFO|2025-01-09 01:06:20] logging.py:157 >> {'loss': 0.6066, 'learning_rate': 9.9198e-07, 'epoch': 0.84}
[INFO|2025-01-09 01:07:37] logging.py:157 >> {'loss': 0.5833, 'learning_rate': 9.1548e-07, 'epoch': 0.84}
[INFO|2025-01-09 01:08:53] logging.py:157 >> {'loss': 0.6100, 'learning_rate': 8.4175e-07, 'epoch': 0.85}
[INFO|2025-01-09 01:10:24] logging.py:157 >> {'loss': 0.5816, 'learning_rate': 7.7084e-07, 'epoch': 0.86}
[INFO|2025-01-09 01:12:06] logging.py:157 >> {'loss': 0.6149, 'learning_rate': 7.0280e-07, 'epoch': 0.86}
[INFO|2025-01-09 01:13:35] logging.py:157 >> {'loss': 0.5452, 'learning_rate': 6.3768e-07, 'epoch': 0.87}
[INFO|2025-01-09 01:14:47] logging.py:157 >> {'loss': 0.6303, 'learning_rate': 5.7552e-07, 'epoch': 0.88}
[INFO|2025-01-09 01:16:02] logging.py:157 >> {'loss': 0.5684, 'learning_rate': 5.1636e-07, 'epoch': 0.88}
[INFO|2025-01-09 01:17:27] logging.py:157 >> {'loss': 0.5962, 'learning_rate': 4.6024e-07, 'epoch': 0.89}
[INFO|2025-01-09 01:18:55] logging.py:157 >> {'loss': 0.6449, 'learning_rate': 4.0720e-07, 'epoch': 0.90}
[INFO|2025-01-09 01:20:17] logging.py:157 >> {'loss': 0.6248, 'learning_rate': 3.5728e-07, 'epoch': 0.90}
[INFO|2025-01-09 01:21:46] logging.py:157 >> {'loss': 0.6428, 'learning_rate': 3.1051e-07, 'epoch': 0.91}
[INFO|2025-01-09 01:23:08] logging.py:157 >> {'loss': 0.5535, 'learning_rate': 2.6692e-07, 'epoch': 0.92}
[INFO|2025-01-09 01:24:34] logging.py:157 >> {'loss': 0.6079, 'learning_rate': 2.2654e-07, 'epoch': 0.92}
[INFO|2025-01-09 01:25:54] logging.py:157 >> {'loss': 0.5986, 'learning_rate': 1.8941e-07, 'epoch': 0.93}
[INFO|2025-01-09 01:27:17] logging.py:157 >> {'loss': 0.5788, 'learning_rate': 1.5553e-07, 'epoch': 0.94}
[INFO|2025-01-09 01:28:48] logging.py:157 >> {'loss': 0.5920, 'learning_rate': 1.2495e-07, 'epoch': 0.94}
[INFO|2025-01-09 01:30:12] logging.py:157 >> {'loss': 0.5842, 'learning_rate': 9.7668e-08, 'epoch': 0.95}
[INFO|2025-01-09 01:31:47] logging.py:157 >> {'loss': 0.5153, 'learning_rate': 7.3716e-08, 'epoch': 0.96}
[INFO|2025-01-09 01:32:58] logging.py:157 >> {'loss': 0.5691, 'learning_rate': 5.3107e-08, 'epoch': 0.96}
[INFO|2025-01-09 01:34:35] logging.py:157 >> {'loss': 0.5571, 'learning_rate': 3.5854e-08, 'epoch': 0.97}
[INFO|2025-01-09 01:36:03] logging.py:157 >> {'loss': 0.5729, 'learning_rate': 2.1970e-08, 'epoch': 0.98}
[INFO|2025-01-09 01:37:22] logging.py:157 >> {'loss': 0.5342, 'learning_rate': 1.1464e-08, 'epoch': 0.98}
[INFO|2025-01-09 01:38:51] logging.py:157 >> {'loss': 0.5942, 'learning_rate': 4.3423e-09, 'epoch': 0.99}
[INFO|2025-01-09 01:40:12] logging.py:157 >> {'loss': 0.6329, 'learning_rate': 6.1072e-10, 'epoch': 1.00}
[INFO|2025-01-09 01:41:03] trainer.py:3801 >> Saving model checkpoint to /root/autodl-tmp/saves/model/checkpoint-753
[INFO|2025-01-09 01:41:03] configuration_utils.py:414 >> Configuration saved in /root/autodl-tmp/saves/model/checkpoint-753/config.json
[INFO|2025-01-09 01:41:03] configuration_utils.py:865 >> Configuration saved in /root/autodl-tmp/saves/model/checkpoint-753/generation_config.json
[INFO|2025-01-09 01:41:08] modeling_utils.py:3043 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameter has been saved in the index located at /root/autodl-tmp/saves/model/checkpoint-753/model.safetensors.index.json.
[INFO|2025-01-09 01:41:08] tokenization_utils_base.py:2646 >> tokenizer config file saved in /root/autodl-tmp/saves/model/checkpoint-753/tokenizer_config.json
[INFO|2025-01-09 01:41:08] tokenization_utils_base.py:2655 >> Special tokens file saved in /root/autodl-tmp/saves/model/checkpoint-753/special_tokens_map.json
[INFO|2025-01-09 01:41:27] trainer.py:2584 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2025-01-09 01:41:29] trainer.py:3801 >> Saving model checkpoint to /root/autodl-tmp/saves/model
[INFO|2025-01-09 01:41:29] configuration_utils.py:414 >> Configuration saved in /root/autodl-tmp/saves/model/config.json
[INFO|2025-01-09 01:41:29] configuration_utils.py:865 >> Configuration saved in /root/autodl-tmp/saves/model/generation_config.json
[INFO|2025-01-09 01:41:35] modeling_utils.py:3043 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameter has been saved in the index located at /root/autodl-tmp/saves/model/model.safetensors.index.json.
[INFO|2025-01-09 01:41:35] tokenization_utils_base.py:2646 >> tokenizer config file saved in /root/autodl-tmp/saves/model/tokenizer_config.json
[INFO|2025-01-09 01:41:35] tokenization_utils_base.py:2655 >> Special tokens file saved in /root/autodl-tmp/saves/model/special_tokens_map.json
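
The final export (config, sharded safetensors weights, tokenizer files) lives under /root/autodl-tmp/saves/model, as logged above. A minimal smoke-test sketch for loading it back and generating, assuming a GPU is available and the accelerate package is installed; the prompt is arbitrary and only for illustration:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

SAVE_DIR = "/root/autodl-tmp/saves/model"

tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR, torch_dtype=torch.bfloat16, device_map="auto"
)

# Encode a throwaway prompt and sample a short continuation.
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```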
[WARNING|2025-01-09 01:41:35] logging.py:162 >> No metric eval_accuracy to plot.
[INFO|2025-01-09 01:41:35] trainer.py:4117 >>
***** Running Evaluation *****
[INFO|2025-01-09 01:41:35] trainer.py:4119 >> Num examples = 243
[INFO|2025-01-09 01:41:35] trainer.py:4122 >> Batch size = 2
[INFO|2025-01-09 01:41:47] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
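
The warning near the end only concerns the missing eval_accuracy plot; the training-loss curve can still be recovered from the {'loss': ..., 'learning_rate': ..., 'epoch': ...} records in this log. A hypothetical helper sketch ("train.log" is a placeholder for a saved copy of this log, not a file named anywhere in the run; matplotlib is assumed to be installed):

```python
import ast
import re

import matplotlib.pyplot as plt

# Pull every {'loss': ..., 'epoch': ...} record out of the log text.
records = []
with open("train.log", encoding="utf-8") as fh:
    for line in fh:
        match = re.search(r"\{'loss'.*?\}", line)
        if match:
            records.append(ast.literal_eval(match.group(0)))

epochs = [r["epoch"] for r in records]
losses = [r["loss"] for r in records]

plt.plot(epochs, losses)
plt.xlabel("epoch")
plt.ylabel("training loss")
plt.savefig("training_loss.png")
```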