|
[2024-12-04 14:10:38,207] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:39,754] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. |
|
[2024-12-04 14:10:39,754] [INFO] [runner.py:571:main] cmd = /vol3/ctr/.conda/envs/llava_rest/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNV19 --master_addr=127.0.0.1 --master_port=29504 --enable_each_rank_log=None llava/train/train_mem.py --deepspeed /vol3/home/ctr/llava-rlhf/LLaVA-REST-MCTS/models/LLaVA/scripts/zero3_offload.json --model_name_or_path /vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b --version v1 --data_path /vol3/home/ctr/llava-rlhf/datasets/aokvqa/aokvqa_policy_train.json --image_folder /vol3/home/ctr/llava-rlhf/datasets/coco --vision_tower /vol3/home/ctr/llava-rlhf/models/clip-vit-large-patch14-336 --mm_projector_type mlp2x_gelu --mm_vision_select_layer -2 --mm_use_im_start_end False --mm_use_im_patch_token False --image_aspect_ratio pad --group_by_modality_length True --bf16 True --output_dir /vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b-sft-policy-v2 --num_train_epochs 3 --per_device_train_batch_size 16 --per_device_eval_batch_size 8 --gradient_accumulation_steps 2 --evaluation_strategy no --save_strategy steps --save_steps 100 --save_total_limit 3 --learning_rate 5e-6 --weight_decay 0.05 --warmup_ratio 0.1 --lr_scheduler_type cosine --logging_steps 1 --tf32 True --model_max_length 2048 --gradient_checkpointing True --dataloader_num_workers 8 --lazy_preprocess True --report_to wandb |
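The zero3_offload.json passed via --deepspeed is not reproduced in this log. As a rough sketch only (not the actual file at that path), a ZeRO-3 configuration with CPU offload of the kind implied by the cpu_adam build and stage3.py messages further down typically looks like the following, written here as the equivalent Python dict ("auto" values are resolved by the HuggingFace Trainer from the command-line flags above):

# Illustrative ZeRO-3 + CPU-offload settings (assumption, not the contents of the file above).
zero3_offload_config = {
    "bf16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "offload_param": {"device": "cpu", "pin_memory": True},
        "overlap_comm": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_clipping": "auto",
}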
|
[2024-12-04 14:10:42,757] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:138:main] 0 NCCL_TIMEOUT=360 |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=360 |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5]} |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=6, node_rank=0 |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5]}) |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:163:main] dist_world_size=6 |
|
[2024-12-04 14:10:44,308] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 |
|
[2024-12-04 14:10:48,071] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:48,370] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:48,394] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:48,408] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:48,450] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:48,473] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) |
|
[2024-12-04 14:10:49,624] [INFO] [comm.py:637:init_distributed] cdb=None |
|
[2024-12-04 14:10:49,895] [INFO] [comm.py:637:init_distributed] cdb=None |
|
[2024-12-04 14:10:49,895] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
[2024-12-04 14:10:49,935] [INFO] [comm.py:637:init_distributed] cdb=None |
|
[2024-12-04 14:10:49,974] [INFO] [comm.py:637:init_distributed] cdb=None |
|
[2024-12-04 14:10:50,051] [INFO] [comm.py:637:init_distributed] cdb=None |
|
[2024-12-04 14:10:50,072] [INFO] [comm.py:637:init_distributed] cdb=None |
|
model_args: ModelArguments(model_name_or_path='/vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b', version='v1', freeze_backbone=False, tune_mm_mlp_adapter=False, vision_tower='/vol3/home/ctr/llava-rlhf/models/clip-vit-large-patch14-336', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, mm_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch') |
|
data_args: DataArguments(data_path='/vol3/home/ctr/llava-rlhf/datasets/aokvqa/aokvqa_policy_train.json', lazy_preprocess=True, is_multimodal=False, image_folder='/vol3/home/ctr/llava-rlhf/datasets/coco', image_aspect_ratio='pad') |
|
training_args: TrainingArguments( |
|
_n_gpu=1, |
|
adafactor=False, |
|
adam_beta1=0.9, |
|
adam_beta2=0.999, |
|
adam_epsilon=1e-08, |
|
auto_find_batch_size=False, |
|
bf16=True, |
|
bf16_full_eval=False, |
|
bits=16, |
|
cache_dir=None, |
|
data_seed=None, |
|
dataloader_drop_last=False, |
|
dataloader_num_workers=8, |
|
dataloader_persistent_workers=False, |
|
dataloader_pin_memory=True, |
|
ddp_backend=None, |
|
ddp_broadcast_buffers=None, |
|
ddp_bucket_cap_mb=None, |
|
ddp_find_unused_parameters=None, |
|
ddp_timeout=1800, |
|
debug=[], |
|
deepspeed=/vol3/home/ctr/llava-rlhf/LLaVA-REST-MCTS/models/LLaVA/scripts/zero3_offload.json, |
|
disable_tqdm=False, |
|
dispatch_batches=None, |
|
do_eval=False, |
|
do_predict=False, |
|
do_train=False, |
|
double_quant=True, |
|
eval_accumulation_steps=None, |
|
eval_delay=0, |
|
eval_steps=None, |
|
evaluation_strategy=no, |
|
fp16=False, |
|
fp16_backend=auto, |
|
fp16_full_eval=False, |
|
fp16_opt_level=O1, |
|
freeze_mm_mlp_adapter=False, |
|
fsdp=[], |
|
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, |
|
fsdp_min_num_params=0, |
|
fsdp_transformer_layer_cls_to_wrap=None, |
|
full_determinism=False, |
|
gradient_accumulation_steps=2, |
|
gradient_checkpointing=True, |
|
gradient_checkpointing_kwargs=None, |
|
greater_is_better=None, |
|
group_by_length=False, |
|
group_by_modality_length=True, |
|
half_precision_backend=auto, |
|
hub_always_push=False, |
|
hub_model_id=None, |
|
hub_private_repo=False, |
|
hub_strategy=every_save, |
|
hub_token=<HUB_TOKEN>, |
|
ignore_data_skip=False, |
|
include_inputs_for_metrics=False, |
|
include_num_input_tokens_seen=False, |
|
include_tokens_per_second=False, |
|
jit_mode_eval=False, |
|
label_names=None, |
|
label_smoothing_factor=0.0, |
|
learning_rate=5e-06, |
|
length_column_name=length, |
|
load_best_model_at_end=False, |
|
local_rank=0, |
|
log_level=passive, |
|
log_level_replica=warning, |
|
log_on_each_node=True, |
|
logging_dir=/vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b-sft-policy-v2/runs/Dec04_14-10-49_a102, |
|
logging_first_step=False, |
|
logging_nan_inf_filter=True, |
|
logging_steps=1.0, |
|
logging_strategy=steps, |
|
lora_alpha=16, |
|
lora_bias=none, |
|
lora_dropout=0.05, |
|
lora_enable=False, |
|
lora_r=64, |
|
lora_weight_path=, |
|
lr_scheduler_kwargs={}, |
|
lr_scheduler_type=cosine, |
|
max_grad_norm=1.0, |
|
max_steps=-1, |
|
metric_for_best_model=None, |
|
mm_projector_lr=None, |
|
model_max_length=2048, |
|
mp_parameters=, |
|
mpt_attn_impl=triton, |
|
neftune_noise_alpha=None, |
|
no_cuda=False, |
|
num_train_epochs=3.0, |
|
optim=adamw_torch, |
|
optim_args=None, |
|
output_dir=/vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b-sft-policy-v2, |
|
overwrite_output_dir=False, |
|
past_index=-1, |
|
per_device_eval_batch_size=8, |
|
per_device_train_batch_size=16, |
|
prediction_loss_only=False, |
|
push_to_hub=False, |
|
push_to_hub_model_id=None, |
|
push_to_hub_organization=None, |
|
push_to_hub_token=<PUSH_TO_HUB_TOKEN>, |
|
quant_type=nf4, |
|
ray_scope=last, |
|
remove_unused_columns=False, |
|
report_to=['wandb'], |
|
resume_from_checkpoint=None, |
|
run_name=/vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b-sft-policy-v2, |
|
save_on_each_node=False, |
|
save_only_model=False, |
|
save_safetensors=True, |
|
save_steps=100, |
|
save_strategy=steps, |
|
save_total_limit=3, |
|
seed=42, |
|
skip_memory_metrics=True, |
|
split_batches=False, |
|
tf32=True, |
|
torch_compile=False, |
|
torch_compile_backend=None, |
|
torch_compile_mode=None, |
|
torchdynamo=None, |
|
tpu_metrics_debug=False, |
|
tpu_num_cores=None, |
|
use_cpu=False, |
|
use_ipex=False, |
|
use_legacy_prediction_loop=False, |
|
use_mps_device=False, |
|
warmup_ratio=0.1, |
|
warmup_steps=0, |
|
weight_decay=0.05, |
|
) |
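A quick back-of-the-envelope check on the effective batch size implied by these arguments (assuming all 6 visible GPUs from the launcher output participate):

# Effective (global) batch size = per-device batch * gradient accumulation * world size.
per_device_train_batch_size = 16
gradient_accumulation_steps = 2
world_size = 6  # ranks 0-5 on localhost, per the launcher output above
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * world_size
print(effective_batch_size)  # 192 samples per optimizer step

The progress bar further down reports 267 optimizer steps over 3 epochs (~89 per epoch), which puts aokvqa_policy_train.json at roughly 89 * 192, i.e. about 17k examples.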
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors. |
|
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. |
|
[2024-12-04 14:11:17,198] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 295, num_elems = 6.76B |
|
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() |
|
return self.fget.__get__(instance, owner)() |
|
Loading checkpoint shards: 50%|βββββ | 1/2 [00:15<00:15, 15.48s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:15<00:15, 15.48s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:15<00:15, 15.46s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:15<00:15, 15.46s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:15<00:15, 15.50s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:16<00:16, 16.18s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 9.63s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 10.51s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 9.63s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 10.51s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:20<00:00, 9.62s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:20<00:00, 10.50s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 9.63s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 10.51s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 9.63s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 10.50s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 9.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:21<00:00, 10.63s/it] |
|
LlavaLlamaForCausalLM( |
|
(model): LlavaLlamaModel( |
|
(embed_tokens): Embedding(32000, 4096, padding_idx=0) |
|
(layers): ModuleList( |
|
(0-31): 32 x LlamaDecoderLayer( |
|
(self_attn): LlamaFlashAttention2( |
|
(q_proj): Linear(in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(in_features=4096, out_features=4096, bias=False) |
|
(v_proj): Linear(in_features=4096, out_features=4096, bias=False) |
|
(o_proj): Linear(in_features=4096, out_features=4096, bias=False) |
|
(rotary_emb): LlamaRotaryEmbedding() |
|
) |
|
(mlp): LlamaMLP( |
|
(gate_proj): Linear(in_features=4096, out_features=11008, bias=False) |
|
(up_proj): Linear(in_features=4096, out_features=11008, bias=False) |
|
(down_proj): Linear(in_features=11008, out_features=4096, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): LlamaRMSNorm() |
|
(post_attention_layernorm): LlamaRMSNorm() |
|
) |
|
) |
|
(norm): LlamaRMSNorm() |
|
(vision_tower): CLIPVisionTower() |
|
(mm_projector): Sequential( |
|
(0): Linear(in_features=1024, out_features=4096, bias=True) |
|
(1): GELU(approximate='none') |
|
(2): Linear(in_features=4096, out_features=4096, bias=True) |
|
) |
|
) |
|
(lm_head): Linear(in_features=4096, out_features=32000, bias=False) |
|
) |
|
[2024-12-04 14:11:45,079] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 686, num_elems = 7.06B |
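The two num_elems figures are consistent with the shapes printed above: the LLaMA-7B weights plus the mm_projector account for roughly 6.76B elements, and adding the CLIP ViT-L/336 vision tower (~0.30B, an approximation) brings the total to roughly 7.06B. A rough tally:

# Rough element count from the module shapes printed above (sanity check only).
hidden, inter, vocab, layers = 4096, 11008, 32000, 32
attn = 4 * hidden * hidden                    # q/k/v/o projections per layer
mlp = 3 * hidden * inter                      # gate/up/down projections per layer
norms = 2 * hidden                            # two RMSNorm weights per layer
llama = layers * (attn + mlp + norms) + 2 * vocab * hidden + hidden  # + embeddings, lm_head, final norm
projector = (1024 * hidden + hidden) + (hidden * hidden + hidden)    # mlp2x_gelu projector
print(round((llama + projector) / 1e9, 2))                   # ~6.76, matching the first init
clip_vit_l_336 = 0.30e9                                      # approximate CLIP vision tower size
print(round((llama + projector + clip_vit_l_336) / 1e9, 2))  # ~7.06, matching the second init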
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
Formatting inputs...Skip in lazy mode |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations |
|
warnings.warn( |
|
model.embed_tokens.weight |
|
model.layers.0.self_attn.q_proj.weight |
|
model.layers.0.self_attn.k_proj.weight |
|
model.layers.0.self_attn.v_proj.weight |
|
model.layers.0.self_attn.o_proj.weight |
|
model.layers.0.mlp.gate_proj.weight |
|
model.layers.0.mlp.up_proj.weight |
|
model.layers.0.mlp.down_proj.weight |
|
model.layers.0.input_layernorm.weight |
|
model.layers.0.post_attention_layernorm.weight |
|
model.layers.1.self_attn.q_proj.weight |
|
model.layers.1.self_attn.k_proj.weight |
|
model.layers.1.self_attn.v_proj.weight |
|
model.layers.1.self_attn.o_proj.weight |
|
model.layers.1.mlp.gate_proj.weight |
|
model.layers.1.mlp.up_proj.weight |
|
model.layers.1.mlp.down_proj.weight |
|
model.layers.1.input_layernorm.weight |
|
model.layers.1.post_attention_layernorm.weight |
|
model.layers.2.self_attn.q_proj.weight |
|
model.layers.2.self_attn.k_proj.weight |
|
model.layers.2.self_attn.v_proj.weight |
|
model.layers.2.self_attn.o_proj.weight |
|
model.layers.2.mlp.gate_proj.weight |
|
model.layers.2.mlp.up_proj.weight |
|
model.layers.2.mlp.down_proj.weight |
|
model.layers.2.input_layernorm.weight |
|
model.layers.2.post_attention_layernorm.weight |
|
model.layers.3.self_attn.q_proj.weight |
|
model.layers.3.self_attn.k_proj.weight |
|
model.layers.3.self_attn.v_proj.weight |
|
model.layers.3.self_attn.o_proj.weight |
|
model.layers.3.mlp.gate_proj.weight |
|
model.layers.3.mlp.up_proj.weight |
|
model.layers.3.mlp.down_proj.weight |
|
model.layers.3.input_layernorm.weight |
|
model.layers.3.post_attention_layernorm.weight |
|
model.layers.4.self_attn.q_proj.weight |
|
model.layers.4.self_attn.k_proj.weight |
|
model.layers.4.self_attn.v_proj.weight |
|
model.layers.4.self_attn.o_proj.weight |
|
model.layers.4.mlp.gate_proj.weight |
|
model.layers.4.mlp.up_proj.weight |
|
model.layers.4.mlp.down_proj.weight |
|
model.layers.4.input_layernorm.weight |
|
model.layers.4.post_attention_layernorm.weight |
|
model.layers.5.self_attn.q_proj.weight |
|
model.layers.5.self_attn.k_proj.weight |
|
model.layers.5.self_attn.v_proj.weight |
|
model.layers.5.self_attn.o_proj.weight |
|
model.layers.5.mlp.gate_proj.weight |
|
model.layers.5.mlp.up_proj.weight |
|
model.layers.5.mlp.down_proj.weight |
|
model.layers.5.input_layernorm.weight |
|
model.layers.5.post_attention_layernorm.weight |
|
model.layers.6.self_attn.q_proj.weight |
|
model.layers.6.self_attn.k_proj.weight |
|
model.layers.6.self_attn.v_proj.weight |
|
model.layers.6.self_attn.o_proj.weight |
|
model.layers.6.mlp.gate_proj.weight |
|
model.layers.6.mlp.up_proj.weight |
|
model.layers.6.mlp.down_proj.weight |
|
model.layers.6.input_layernorm.weight |
|
model.layers.6.post_attention_layernorm.weight |
|
model.layers.7.self_attn.q_proj.weight |
|
model.layers.7.self_attn.k_proj.weight |
|
model.layers.7.self_attn.v_proj.weight |
|
model.layers.7.self_attn.o_proj.weight |
|
model.layers.7.mlp.gate_proj.weight |
|
model.layers.7.mlp.up_proj.weight |
|
model.layers.7.mlp.down_proj.weight |
|
model.layers.7.input_layernorm.weight |
|
model.layers.7.post_attention_layernorm.weight |
|
model.layers.8.self_attn.q_proj.weight |
|
model.layers.8.self_attn.k_proj.weight |
|
model.layers.8.self_attn.v_proj.weight |
|
model.layers.8.self_attn.o_proj.weight |
|
model.layers.8.mlp.gate_proj.weight |
|
model.layers.8.mlp.up_proj.weight |
|
model.layers.8.mlp.down_proj.weight |
|
model.layers.8.input_layernorm.weight |
|
model.layers.8.post_attention_layernorm.weight |
|
model.layers.9.self_attn.q_proj.weight |
|
model.layers.9.self_attn.k_proj.weight |
|
model.layers.9.self_attn.v_proj.weight |
|
model.layers.9.self_attn.o_proj.weight |
|
model.layers.9.mlp.gate_proj.weight |
|
model.layers.9.mlp.up_proj.weight |
|
model.layers.9.mlp.down_proj.weight |
|
model.layers.9.input_layernorm.weight |
|
model.layers.9.post_attention_layernorm.weight |
|
model.layers.10.self_attn.q_proj.weight |
|
model.layers.10.self_attn.k_proj.weight |
|
model.layers.10.self_attn.v_proj.weight |
|
model.layers.10.self_attn.o_proj.weight |
|
model.layers.10.mlp.gate_proj.weight |
|
model.layers.10.mlp.up_proj.weight |
|
model.layers.10.mlp.down_proj.weight |
|
model.layers.10.input_layernorm.weight |
|
model.layers.10.post_attention_layernorm.weight |
|
model.layers.11.self_attn.q_proj.weight |
|
model.layers.11.self_attn.k_proj.weight |
|
model.layers.11.self_attn.v_proj.weight |
|
model.layers.11.self_attn.o_proj.weight |
|
model.layers.11.mlp.gate_proj.weight |
|
model.layers.11.mlp.up_proj.weight |
|
model.layers.11.mlp.down_proj.weight |
|
model.layers.11.input_layernorm.weight |
|
model.layers.11.post_attention_layernorm.weight |
|
model.layers.12.self_attn.q_proj.weight |
|
model.layers.12.self_attn.k_proj.weight |
|
model.layers.12.self_attn.v_proj.weight |
|
model.layers.12.self_attn.o_proj.weight |
|
model.layers.12.mlp.gate_proj.weight |
|
model.layers.12.mlp.up_proj.weight |
|
model.layers.12.mlp.down_proj.weight |
|
model.layers.12.input_layernorm.weight |
|
model.layers.12.post_attention_layernorm.weight |
|
model.layers.13.self_attn.q_proj.weight |
|
model.layers.13.self_attn.k_proj.weight |
|
model.layers.13.self_attn.v_proj.weight |
|
model.layers.13.self_attn.o_proj.weight |
|
model.layers.13.mlp.gate_proj.weight |
|
model.layers.13.mlp.up_proj.weight |
|
model.layers.13.mlp.down_proj.weight |
|
model.layers.13.input_layernorm.weight |
|
model.layers.13.post_attention_layernorm.weight |
|
model.layers.14.self_attn.q_proj.weight |
|
model.layers.14.self_attn.k_proj.weight |
|
model.layers.14.self_attn.v_proj.weight |
|
model.layers.14.self_attn.o_proj.weight |
|
model.layers.14.mlp.gate_proj.weight |
|
model.layers.14.mlp.up_proj.weight |
|
model.layers.14.mlp.down_proj.weight |
|
model.layers.14.input_layernorm.weight |
|
model.layers.14.post_attention_layernorm.weight |
|
model.layers.15.self_attn.q_proj.weight |
|
model.layers.15.self_attn.k_proj.weight |
|
model.layers.15.self_attn.v_proj.weight |
|
model.layers.15.self_attn.o_proj.weight |
|
model.layers.15.mlp.gate_proj.weight |
|
model.layers.15.mlp.up_proj.weight |
|
model.layers.15.mlp.down_proj.weight |
|
model.layers.15.input_layernorm.weight |
|
model.layers.15.post_attention_layernorm.weight |
|
model.layers.16.self_attn.q_proj.weight |
|
model.layers.16.self_attn.k_proj.weight |
|
model.layers.16.self_attn.v_proj.weight |
|
model.layers.16.self_attn.o_proj.weight |
|
model.layers.16.mlp.gate_proj.weight |
|
model.layers.16.mlp.up_proj.weight |
|
model.layers.16.mlp.down_proj.weight |
|
model.layers.16.input_layernorm.weight |
|
model.layers.16.post_attention_layernorm.weight |
|
model.layers.17.self_attn.q_proj.weight |
|
model.layers.17.self_attn.k_proj.weight |
|
model.layers.17.self_attn.v_proj.weight |
|
model.layers.17.self_attn.o_proj.weight |
|
model.layers.17.mlp.gate_proj.weight |
|
model.layers.17.mlp.up_proj.weight |
|
model.layers.17.mlp.down_proj.weight |
|
model.layers.17.input_layernorm.weight |
|
model.layers.17.post_attention_layernorm.weight |
|
model.layers.18.self_attn.q_proj.weight |
|
model.layers.18.self_attn.k_proj.weight |
|
model.layers.18.self_attn.v_proj.weight |
|
model.layers.18.self_attn.o_proj.weight |
|
model.layers.18.mlp.gate_proj.weight |
|
model.layers.18.mlp.up_proj.weight |
|
model.layers.18.mlp.down_proj.weight |
|
model.layers.18.input_layernorm.weight |
|
model.layers.18.post_attention_layernorm.weight |
|
model.layers.19.self_attn.q_proj.weight |
|
model.layers.19.self_attn.k_proj.weight |
|
model.layers.19.self_attn.v_proj.weight |
|
model.layers.19.self_attn.o_proj.weight |
|
model.layers.19.mlp.gate_proj.weight |
|
model.layers.19.mlp.up_proj.weight |
|
model.layers.19.mlp.down_proj.weight |
|
model.layers.19.input_layernorm.weight |
|
model.layers.19.post_attention_layernorm.weight |
|
model.layers.20.self_attn.q_proj.weight |
|
model.layers.20.self_attn.k_proj.weight |
|
model.layers.20.self_attn.v_proj.weight |
|
model.layers.20.self_attn.o_proj.weight |
|
model.layers.20.mlp.gate_proj.weight |
|
model.layers.20.mlp.up_proj.weight |
|
model.layers.20.mlp.down_proj.weight |
|
model.layers.20.input_layernorm.weight |
|
model.layers.20.post_attention_layernorm.weight |
|
model.layers.21.self_attn.q_proj.weight |
|
model.layers.21.self_attn.k_proj.weight |
|
model.layers.21.self_attn.v_proj.weight |
|
model.layers.21.self_attn.o_proj.weight |
|
model.layers.21.mlp.gate_proj.weight |
|
model.layers.21.mlp.up_proj.weight |
|
model.layers.21.mlp.down_proj.weight |
|
model.layers.21.input_layernorm.weight |
|
model.layers.21.post_attention_layernorm.weight |
|
model.layers.22.self_attn.q_proj.weight |
|
model.layers.22.self_attn.k_proj.weight |
|
model.layers.22.self_attn.v_proj.weight |
|
model.layers.22.self_attn.o_proj.weight |
|
model.layers.22.mlp.gate_proj.weight |
|
model.layers.22.mlp.up_proj.weight |
|
model.layers.22.mlp.down_proj.weight |
|
model.layers.22.input_layernorm.weight |
|
model.layers.22.post_attention_layernorm.weight |
|
model.layers.23.self_attn.q_proj.weight |
|
model.layers.23.self_attn.k_proj.weight |
|
model.layers.23.self_attn.v_proj.weight |
|
model.layers.23.self_attn.o_proj.weight |
|
model.layers.23.mlp.gate_proj.weight |
|
model.layers.23.mlp.up_proj.weight |
|
model.layers.23.mlp.down_proj.weight |
|
model.layers.23.input_layernorm.weight |
|
model.layers.23.post_attention_layernorm.weight |
|
model.layers.24.self_attn.q_proj.weight |
|
model.layers.24.self_attn.k_proj.weight |
|
model.layers.24.self_attn.v_proj.weight |
|
model.layers.24.self_attn.o_proj.weight |
|
model.layers.24.mlp.gate_proj.weight |
|
model.layers.24.mlp.up_proj.weight |
|
model.layers.24.mlp.down_proj.weight |
|
model.layers.24.input_layernorm.weight |
|
model.layers.24.post_attention_layernorm.weight |
|
model.layers.25.self_attn.q_proj.weight |
|
model.layers.25.self_attn.k_proj.weight |
|
model.layers.25.self_attn.v_proj.weight |
|
model.layers.25.self_attn.o_proj.weight |
|
model.layers.25.mlp.gate_proj.weight |
|
model.layers.25.mlp.up_proj.weight |
|
model.layers.25.mlp.down_proj.weight |
|
model.layers.25.input_layernorm.weight |
|
model.layers.25.post_attention_layernorm.weight |
|
model.layers.26.self_attn.q_proj.weight |
|
model.layers.26.self_attn.k_proj.weight |
|
model.layers.26.self_attn.v_proj.weight |
|
model.layers.26.self_attn.o_proj.weight |
|
model.layers.26.mlp.gate_proj.weight |
|
model.layers.26.mlp.up_proj.weight |
|
model.layers.26.mlp.down_proj.weight |
|
model.layers.26.input_layernorm.weight |
|
model.layers.26.post_attention_layernorm.weight |
|
model.layers.27.self_attn.q_proj.weight |
|
model.layers.27.self_attn.k_proj.weight |
|
model.layers.27.self_attn.v_proj.weight |
|
model.layers.27.self_attn.o_proj.weight |
|
model.layers.27.mlp.gate_proj.weight |
|
model.layers.27.mlp.up_proj.weight |
|
model.layers.27.mlp.down_proj.weight |
|
model.layers.27.input_layernorm.weight |
|
model.layers.27.post_attention_layernorm.weight |
|
model.layers.28.self_attn.q_proj.weight |
|
model.layers.28.self_attn.k_proj.weight |
|
model.layers.28.self_attn.v_proj.weight |
|
model.layers.28.self_attn.o_proj.weight |
|
model.layers.28.mlp.gate_proj.weight |
|
model.layers.28.mlp.up_proj.weight |
|
model.layers.28.mlp.down_proj.weight |
|
model.layers.28.input_layernorm.weight |
|
model.layers.28.post_attention_layernorm.weight |
|
model.layers.29.self_attn.q_proj.weight |
|
model.layers.29.self_attn.k_proj.weight |
|
model.layers.29.self_attn.v_proj.weight |
|
model.layers.29.self_attn.o_proj.weight |
|
model.layers.29.mlp.gate_proj.weight |
|
model.layers.29.mlp.up_proj.weight |
|
model.layers.29.mlp.down_proj.weight |
|
model.layers.29.input_layernorm.weight |
|
model.layers.29.post_attention_layernorm.weight |
|
model.layers.30.self_attn.q_proj.weight |
|
model.layers.30.self_attn.k_proj.weight |
|
model.layers.30.self_attn.v_proj.weight |
|
model.layers.30.self_attn.o_proj.weight |
|
model.layers.30.mlp.gate_proj.weight |
|
model.layers.30.mlp.up_proj.weight |
|
model.layers.30.mlp.down_proj.weight |
|
model.layers.30.input_layernorm.weight |
|
model.layers.30.post_attention_layernorm.weight |
|
model.layers.31.self_attn.q_proj.weight |
|
model.layers.31.self_attn.k_proj.weight |
|
model.layers.31.self_attn.v_proj.weight |
|
model.layers.31.self_attn.o_proj.weight |
|
model.layers.31.mlp.gate_proj.weight |
|
model.layers.31.mlp.up_proj.weight |
|
model.layers.31.mlp.down_proj.weight |
|
model.layers.31.input_layernorm.weight |
|
model.layers.31.post_attention_layernorm.weight |
|
model.norm.weight |
|
model.mm_projector.0.weight |
|
model.mm_projector.0.bias |
|
model.mm_projector.2.weight |
|
model.mm_projector.2.bias |
|
lm_head.weight |
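The 295 names above (32 decoder layers x 9 tensors each, plus embed_tokens, the final norm, the four mm_projector tensors and lm_head) match num_params = 295 from the first ZeRO-3 init; the CLIP vision tower weights are not listed, consistent with a dump of the trainable parameters only. A sketch of how such a dump is typically produced (assumption; the actual print lives somewhere in the LLaVA training code):

import torch.nn as nn

def print_trainable_parameter_names(model: nn.Module) -> None:
    # Print the name of every parameter that will receive gradients.
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)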
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /vol3/ctr/.cache/torch_extensions/py310_cu121/cpu_adam/build.ninja... |
|
Building extension module cpu_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
ninja: no work to do. |
|
Loading extension module cpu_adam... |
|
Time to load cpu_adam op: 2.6055219173431396 seconds |
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /vol3/ctr/.cache/torch_extensions/py310_cu121/cpu_adam/build.ninja... |
|
Building extension module cpu_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
ninja: no work to do. |
|
Loading extension module cpu_adam... |
|
Time to load cpu_adam op: 2.6278350353240967 seconds |
|
Loading extension module cpu_adam... |
|
Loading extension module cpu_adam... |
|
Time to load cpu_adam op: 2.7238733768463135 seconds |
|
Time to load cpu_adam op: 2.71661114692688 seconds |
|
Loading extension module cpu_adam... |
|
Time to load cpu_adam op: 2.703078031539917 seconds |
|
Installed CUDA version 12.3 does not match the version torch was compiled with 12.1 but since the APIs are compatible, accepting this combination |
|
Using /vol3/ctr/.cache/torch_extensions/py310_cu121 as PyTorch extensions root... |
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /vol3/ctr/.cache/torch_extensions/py310_cu121/cpu_adam/build.ninja... |
|
Building extension module cpu_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
ninja: no work to do. |
|
Loading extension module cpu_adam... |
|
Time to load cpu_adam op: 2.6260738372802734 seconds |
|
Parameter Offload: Total persistent parameters: 599040 in 312 params |
|
wandb: Using wandb-core as the SDK backend. Please refer to https: |
|
wandb: Currently logged in as: s1820587. Use `wandb login --relogin` to force relogin |
|
wandb: Waiting for wandb.init()...
wandb: Tracking run with wandb version 0.18.7 |
|
wandb: Run data is saved locally in /vol3/home/ctr/llava-rlhf/LLaVA-REST-MCTS/models/LLaVA/wandb/run-20241204_141227-svgisw9q |
|
wandb: Run `wandb offline` to turn off syncing. |
|
wandb: Syncing run resilient-serenity-341 |
|
wandb: ⭐️ View project at https: |
|
wandb: 🚀 View run at https: |
|
0%| | 0/267 [00:00<?, ?it/s]/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py:1330: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.) |
|
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) |
|
0%| | 1/267 [00:37<2:48:14, 37.95s/it]
{'loss': 1.2395, 'learning_rate': 0.0, 'epoch': 0.01} |
|
0%| | 1/267 [00:37<2:48:14, 37.95s/it]
1%| | 2/267 [00:59<2:04:17, 28.14s/it]
{'loss': 1.2109, 'learning_rate': 1.0515495892857625e-06, 'epoch': 0.02} |
|
1%| | 2/267 [00:59<2:04:17, 28.14s/it]
1%| | 3/267 [01:19<1:49:00, 24.77s/it]
{'loss': 1.2171, 'learning_rate': 1.666666666666667e-06, 'epoch': 0.03} |
|
1%| | 3/267 [01:19<1:49:00, 24.77s/it]
1%|β | 4/267 [01:40<1:41:00, 23.05s/it]
{'loss': 1.1818, 'learning_rate': 2.103099178571525e-06, 'epoch': 0.04} |
|
1%|β | 4/267 [01:40<1:41:00, 23.05s/it]
2%|β | 5/267 [02:00<1:36:46, 22.16s/it]
{'loss': 1.1623, 'learning_rate': 2.4416225345298787e-06, 'epoch': 0.06} |
|
2%|β | 5/267 [02:00<1:36:46, 22.16s/it]
2%|β | 6/267 [02:21<1:34:07, 21.64s/it]
{'loss': 1.1307, 'learning_rate': 2.7182162559524295e-06, 'epoch': 0.07} |
|
2%|β | 6/267 [02:21<1:34:07, 21.64s/it]
3%|β | 7/267 [02:42<1:32:16, 21.29s/it]
{'loss': 1.1187, 'learning_rate': 2.9520729152690373e-06, 'epoch': 0.08} |
|
3%|β | 7/267 [02:42<1:32:16, 21.29s/it]
3%|β | 8/267 [03:02<1:30:39, 21.00s/it]
{'loss': 1.1144, 'learning_rate': 3.1546487678572874e-06, 'epoch': 0.09} |
|
3%|β | 8/267 [03:02<1:30:39, 21.00s/it]
3%|β | 9/267 [03:23<1:29:44, 20.87s/it]
{'loss': 1.0377, 'learning_rate': 3.333333333333334e-06, 'epoch': 0.1} |
|
3%|β | 9/267 [03:23<1:29:44, 20.87s/it]
4%|β | 10/267 [03:43<1:29:20, 20.86s/it]
{'loss': 1.0311, 'learning_rate': 3.493172123815642e-06, 'epoch': 0.11} |
|
4%|β | 10/267 [03:43<1:29:20, 20.86s/it]
4%|β | 11/267 [04:04<1:28:40, 20.78s/it]
{'loss': 1.0227, 'learning_rate': 3.637763897740231e-06, 'epoch': 0.12} |
|
4%|β | 11/267 [04:04<1:28:40, 20.78s/it]
4%|β | 12/267 [04:25<1:28:20, 20.79s/it]
{'loss': 1.0224, 'learning_rate': 3.769765845238192e-06, 'epoch': 0.13} |
|
4%|β | 12/267 [04:25<1:28:20, 20.79s/it]
5%|β | 13/267 [04:45<1:27:37, 20.70s/it]
{'loss': 0.9782, 'learning_rate': 3.891195865787989e-06, 'epoch': 0.15} |
|
5%|β | 13/267 [04:45<1:27:37, 20.70s/it]
5%|β | 14/267 [05:06<1:27:02, 20.64s/it]
{'loss': 0.981, 'learning_rate': 4.003622504554799e-06, 'epoch': 0.16} |
|
5%|β | 14/267 [05:06<1:27:02, 20.64s/it]
6%|β | 15/267 [05:26<1:26:13, 20.53s/it]
{'loss': 0.9871, 'learning_rate': 4.108289201196546e-06, 'epoch': 0.17} |
|
6%|β | 15/267 [05:26<1:26:13, 20.53s/it]
6%|β | 16/267 [05:47<1:25:57, 20.55s/it]
{'loss': 0.9874, 'learning_rate': 4.20619835714305e-06, 'epoch': 0.18} |
|
6%|β | 16/267 [05:47<1:25:57, 20.55s/it]
6%|β | 17/267 [06:07<1:25:42, 20.57s/it]
{'loss': 0.9525, 'learning_rate': 4.29816987193761e-06, 'epoch': 0.19} |
|
6%|β | 17/267 [06:07<1:25:42, 20.57s/it]
7%|β | 18/267 [06:28<1:25:13, 20.54s/it]
{'loss': 0.9614, 'learning_rate': 4.384882922619096e-06, 'epoch': 0.2} |
|
7%|β | 18/267 [06:28<1:25:13, 20.54s/it]
7%|β | 19/267 [06:48<1:24:48, 20.52s/it]
{'loss': 0.9527, 'learning_rate': 4.466906432077293e-06, 'epoch': 0.21} |
|
7%|β | 19/267 [06:48<1:24:48, 20.52s/it]
7%|β | 20/267 [07:09<1:24:16, 20.47s/it]
{'loss': 0.9404, 'learning_rate': 4.5447217131014036e-06, 'epoch': 0.22} |
|
7%|β | 20/267 [07:09<1:24:16, 20.47s/it]
8%|β | 21/267 [07:29<1:24:04, 20.50s/it]
{'loss': 0.9461, 'learning_rate': 4.618739581935704e-06, 'epoch': 0.24} |
|
8%|β | 21/267 [07:29<1:24:04, 20.50s/it]
8%|β | 22/267 [07:50<1:23:30, 20.45s/it]
{'loss': 0.8768, 'learning_rate': 4.689313487025993e-06, 'epoch': 0.25} |
|
8%|β | 22/267 [07:50<1:23:30, 20.45s/it]
9%|β | 23/267 [08:10<1:23:14, 20.47s/it]
{'loss': 0.8974, 'learning_rate': 4.756749717000453e-06, 'epoch': 0.26} |
|
9%|β | 23/267 [08:10<1:23:14, 20.47s/it]
9%|β | 24/267 [08:31<1:22:55, 20.48s/it]
{'loss': 0.9084, 'learning_rate': 4.821315434523955e-06, 'epoch': 0.27} |
|
9%|β | 24/267 [08:31<1:22:55, 20.48s/it]
9%|β | 25/267 [08:51<1:22:36, 20.48s/it]
{'loss': 0.9043, 'learning_rate': 4.883245069059757e-06, 'epoch': 0.28} |
|
9%|β | 25/267 [08:51<1:22:36, 20.48s/it]
10%|β | 26/267 [09:12<1:22:13, 20.47s/it]
{'loss': 0.9133, 'learning_rate': 4.942745455073751e-06, 'epoch': 0.29} |
|
10%|β | 26/267 [09:12<1:22:13, 20.47s/it]
10%|β | 27/267 [09:32<1:22:16, 20.57s/it]
{'loss': 0.8901, 'learning_rate': 5e-06, 'epoch': 0.3} |
|
10%|β | 27/267 [09:32<1:22:16, 20.57s/it]
10%|β | 28/267 [09:53<1:21:37, 20.49s/it]
{'loss': 0.8853, 'learning_rate': 5e-06, 'epoch': 0.31} |
|
10%|β | 28/267 [09:53<1:21:37, 20.49s/it]
11%|β | 29/267 [10:13<1:21:12, 20.47s/it]
{'loss': 0.9018, 'learning_rate': 5e-06, 'epoch': 0.33} |
|
11%|β | 29/267 [10:13<1:21:12, 20.47s/it]
11%|β | 30/267 [10:33<1:20:48, 20.46s/it]
{'loss': 0.8816, 'learning_rate': 5e-06, 'epoch': 0.34} |
|
11%|β | 30/267 [10:33<1:20:48, 20.46s/it]
12%|ββ | 31/267 [10:54<1:20:32, 20.48s/it]
{'loss': 0.8913, 'learning_rate': 5e-06, 'epoch': 0.35} |
|
12%|ββ | 31/267 [10:54<1:20:32, 20.48s/it]
12%|ββ | 32/267 [11:14<1:20:04, 20.44s/it]
{'loss': 0.8977, 'learning_rate': 5e-06, 'epoch': 0.36} |
|
12%|ββ | 32/267 [11:14<1:20:04, 20.44s/it]
12%|ββ | 33/267 [11:35<1:19:58, 20.51s/it]
{'loss': 0.8924, 'learning_rate': 5e-06, 'epoch': 0.37} |
|
12%|ββ | 33/267 [11:35<1:19:58, 20.51s/it]
13%|ββ | 34/267 [11:55<1:19:33, 20.49s/it]
{'loss': 0.8968, 'learning_rate': 5e-06, 'epoch': 0.38} |
|
13%|ββ | 34/267 [11:55<1:19:33, 20.49s/it]
13%|ββ | 35/267 [12:16<1:19:10, 20.47s/it]
{'loss': 0.8943, 'learning_rate': 5e-06, 'epoch': 0.39} |
|
13%|ββ | 35/267 [12:16<1:19:10, 20.47s/it]
13%|ββ | 36/267 [12:36<1:18:47, 20.46s/it]
{'loss': 0.8473, 'learning_rate': 5e-06, 'epoch': 0.4} |
|
13%|ββ | 36/267 [12:36<1:18:47, 20.46s/it]
14%|ββ | 37/267 [12:57<1:18:14, 20.41s/it]
{'loss': 0.8422, 'learning_rate': 5e-06, 'epoch': 0.42} |
|
14%|ββ | 37/267 [12:57<1:18:14, 20.41s/it]
14%|ββ | 38/267 [13:17<1:17:50, 20.40s/it]
{'loss': 0.8431, 'learning_rate': 5e-06, 'epoch': 0.43} |
|
14%|ββ | 38/267 [13:17<1:17:50, 20.40s/it]
15%|ββ | 39/267 [13:38<1:17:45, 20.46s/it]
{'loss': 0.881, 'learning_rate': 5e-06, 'epoch': 0.44} |
|
15%|ββ | 39/267 [13:38<1:17:45, 20.46s/it]
15%|ββ | 40/267 [13:59<1:18:46, 20.82s/it]
{'loss': 0.8746, 'learning_rate': 5e-06, 'epoch': 0.45} |
|
15%|ββ | 40/267 [13:59<1:18:46, 20.82s/it]
15%|ββ | 41/267 [14:20<1:18:01, 20.71s/it]
{'loss': 0.8698, 'learning_rate': 5e-06, 'epoch': 0.46} |
|
15%|ββ | 41/267 [14:20<1:18:01, 20.71s/it]
16%|ββ | 42/267 [14:40<1:17:20, 20.62s/it]
{'loss': 0.8539, 'learning_rate': 5e-06, 'epoch': 0.47} |
|
16%|ββ | 42/267 [14:40<1:17:20, 20.62s/it]
16%|ββ | 43/267 [15:01<1:16:47, 20.57s/it]
{'loss': 0.8405, 'learning_rate': 5e-06, 'epoch': 0.48} |
|
16%|ββ | 43/267 [15:01<1:16:47, 20.57s/it]
16%|ββ | 44/267 [15:21<1:16:21, 20.54s/it]
{'loss': 0.8629, 'learning_rate': 5e-06, 'epoch': 0.49} |
|
16%|ββ | 44/267 [15:21<1:16:21, 20.54s/it]
17%|ββ | 45/267 [15:42<1:15:54, 20.51s/it]
{'loss': 0.8723, 'learning_rate': 5e-06, 'epoch': 0.51} |
|
17%|ββ | 45/267 [15:42<1:15:54, 20.51s/it]
17%|ββ | 46/267 [16:02<1:15:37, 20.53s/it]
{'loss': 0.8686, 'learning_rate': 5e-06, 'epoch': 0.52} |
|
17%|ββ | 46/267 [16:02<1:15:37, 20.53s/it]
18%|ββ | 47/267 [16:23<1:15:13, 20.52s/it]
{'loss': 0.8587, 'learning_rate': 5e-06, 'epoch': 0.53} |
|
18%|ββ | 47/267 [16:23<1:15:13, 20.52s/it]
18%|ββ | 48/267 [16:43<1:14:46, 20.48s/it]
{'loss': 0.8751, 'learning_rate': 5e-06, 'epoch': 0.54} |
|
18%|ββ | 48/267 [16:43<1:14:46, 20.48s/it]
18%|ββ | 49/267 [17:04<1:15:29, 20.78s/it]
{'loss': 0.8564, 'learning_rate': 5e-06, 'epoch': 0.55} |
|
18%|ββ | 49/267 [17:04<1:15:29, 20.78s/it]
19%|ββ | 50/267 [17:25<1:14:39, 20.65s/it]
{'loss': 0.8286, 'learning_rate': 5e-06, 'epoch': 0.56} |
|
19%|ββ | 50/267 [17:25<1:14:39, 20.65s/it]
19%|ββ | 51/267 [17:45<1:13:53, 20.53s/it]
{'loss': 0.8606, 'learning_rate': 5e-06, 'epoch': 0.57} |
|
19%|ββ | 51/267 [17:45<1:13:53, 20.53s/it]
19%|ββ | 52/267 [18:05<1:13:23, 20.48s/it]
{'loss': 0.8463, 'learning_rate': 5e-06, 'epoch': 0.58} |
|
19%|ββ | 52/267 [18:05<1:13:23, 20.48s/it]
20%|ββ | 53/267 [18:26<1:12:58, 20.46s/it]
{'loss': 0.8392, 'learning_rate': 5e-06, 'epoch': 0.6} |
|
20%|ββ | 53/267 [18:26<1:12:58, 20.46s/it]
20%|ββ | 54/267 [18:46<1:12:23, 20.39s/it]
{'loss': 0.8416, 'learning_rate': 5e-06, 'epoch': 0.61} |
|
20%|ββ | 54/267 [18:46<1:12:23, 20.39s/it]
21%|ββ | 55/267 [19:07<1:12:08, 20.42s/it]
{'loss': 0.8546, 'learning_rate': 5e-06, 'epoch': 0.62} |
|
21%|ββ | 55/267 [19:07<1:12:08, 20.42s/it]
21%|ββ | 56/267 [19:27<1:12:02, 20.49s/it]
{'loss': 0.8399, 'learning_rate': 5e-06, 'epoch': 0.63} |
|
21%|ββ | 56/267 [19:27<1:12:02, 20.49s/it]
21%|βββ | 57/267 [19:48<1:11:42, 20.49s/it]
{'loss': 0.8406, 'learning_rate': 5e-06, 'epoch': 0.64} |
|
21%|βββ | 57/267 [19:48<1:11:42, 20.49s/it]
22%|βββ | 58/267 [20:09<1:12:22, 20.78s/it]
{'loss': 0.8361, 'learning_rate': 5e-06, 'epoch': 0.65} |
|
22%|βββ | 58/267 [20:09<1:12:22, 20.78s/it]
22%|βββ | 59/267 [20:30<1:11:38, 20.67s/it]
{'loss': 0.835, 'learning_rate': 5e-06, 'epoch': 0.66} |
|
22%|βββ | 59/267 [20:30<1:11:38, 20.67s/it]
22%|βββ | 60/267 [20:50<1:10:57, 20.57s/it]
{'loss': 0.8269, 'learning_rate': 5e-06, 'epoch': 0.67} |
|
22%|βββ | 60/267 [20:50<1:10:57, 20.57s/it]
23%|βββ | 61/267 [21:11<1:11:36, 20.86s/it]
{'loss': 0.8191, 'learning_rate': 5e-06, 'epoch': 0.69} |
|
23%|βββ | 61/267 [21:11<1:11:36, 20.86s/it]
23%|βββ | 62/267 [21:32<1:10:54, 20.75s/it]
{'loss': 0.8406, 'learning_rate': 5e-06, 'epoch': 0.7} |
|
23%|βββ | 62/267 [21:32<1:10:54, 20.75s/it]
24%|βββ | 63/267 [21:52<1:09:50, 20.54s/it]
{'loss': 0.8486, 'learning_rate': 5e-06, 'epoch': 0.71} |
|
24%|βββ | 63/267 [21:52<1:09:50, 20.54s/it]
24%|βββ | 64/267 [22:12<1:09:15, 20.47s/it]
{'loss': 0.8433, 'learning_rate': 5e-06, 'epoch': 0.72} |
|
24%|βββ | 64/267 [22:12<1:09:15, 20.47s/it]
24%|βββ | 65/267 [22:33<1:08:55, 20.47s/it]
{'loss': 0.8283, 'learning_rate': 5e-06, 'epoch': 0.73} |
|
24%|βββ | 65/267 [22:33<1:08:55, 20.47s/it]
25%|βββ | 66/267 [22:54<1:09:10, 20.65s/it]
{'loss': 0.8113, 'learning_rate': 5e-06, 'epoch': 0.74} |
|
25%|βββ | 66/267 [22:54<1:09:10, 20.65s/it]
25%|βββ | 67/267 [23:14<1:08:36, 20.58s/it]
{'loss': 0.8228, 'learning_rate': 5e-06, 'epoch': 0.75} |
|
25%|βββ | 67/267 [23:14<1:08:36, 20.58s/it]
25%|βββ | 68/267 [23:35<1:08:00, 20.50s/it]
{'loss': 0.8351, 'learning_rate': 5e-06, 'epoch': 0.76} |
|
25%|βββ | 68/267 [23:35<1:08:00, 20.50s/it]
26%|βββ | 69/267 [23:55<1:07:24, 20.43s/it]
{'loss': 0.824, 'learning_rate': 5e-06, 'epoch': 0.78} |
|
26%|βββ | 69/267 [23:55<1:07:24, 20.43s/it]
26%|βββ | 70/267 [24:15<1:06:57, 20.39s/it]
{'loss': 0.8354, 'learning_rate': 5e-06, 'epoch': 0.79} |
|
26%|βββ | 70/267 [24:15<1:06:57, 20.39s/it]
27%|βββ | 71/267 [24:35<1:06:23, 20.33s/it]
{'loss': 0.817, 'learning_rate': 5e-06, 'epoch': 0.8} |
|
27%|βββ | 71/267 [24:35<1:06:23, 20.33s/it]
27%|βββ | 72/267 [24:56<1:05:59, 20.31s/it]
{'loss': 0.8271, 'learning_rate': 5e-06, 'epoch': 0.81} |
|
27%|βββ | 72/267 [24:56<1:05:59, 20.31s/it]
27%|βββ | 73/267 [25:16<1:05:31, 20.27s/it]
{'loss': 0.8221, 'learning_rate': 5e-06, 'epoch': 0.82} |
|
27%|βββ | 73/267 [25:16<1:05:31, 20.27s/it]
28%|βββ | 74/267 [25:36<1:05:14, 20.28s/it]
{'loss': 0.8458, 'learning_rate': 5e-06, 'epoch': 0.83} |
|
28%|βββ | 74/267 [25:36<1:05:14, 20.28s/it]
28%|βββ | 75/267 [25:56<1:04:56, 20.29s/it]
{'loss': 0.8547, 'learning_rate': 5e-06, 'epoch': 0.84} |
|
 28%|βββ       | 75/267 [25:56<1:04:56, 20.29s/it]
[2024-12-04 14:38:46,068] [WARNING] [stage3.py:1991:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time |
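The stage3.py warning above suggests flushing the allocator cache on every rank at the same point in the step; a minimal sketch of that suggestion (illustrative only, since this run drives training through the HuggingFace Trainer rather than a hand-written loop):

# Sketch of the cache flush suggested by the warning above (not part of this run's code).
from deepspeed.accelerator import get_accelerator

def flush_accelerator_cache() -> None:
    # Release cached allocator blocks so all ranks flush at the same time.
    get_accelerator().empty_cache()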
|
28%|βββ | 76/267 [26:17<1:04:59, 20.42s/it]
{'loss': 0.8249, 'learning_rate': 5e-06, 'epoch': 0.85} |
|
28%|βββ | 76/267 [26:17<1:04:59, 20.42s/it]
29%|βββ | 77/267 [26:37<1:04:40, 20.42s/it]
{'loss': 0.8228, 'learning_rate': 5e-06, 'epoch': 0.87} |
|
29%|βββ | 77/267 [26:37<1:04:40, 20.42s/it]
29%|βββ | 78/267 [26:58<1:04:06, 20.35s/it]
{'loss': 0.8407, 'learning_rate': 5e-06, 'epoch': 0.88} |
|
29%|βββ | 78/267 [26:58<1:04:06, 20.35s/it]
30%|βββ | 79/267 [27:18<1:03:38, 20.31s/it]
{'loss': 0.8271, 'learning_rate': 5e-06, 'epoch': 0.89} |
|
30%|βββ | 79/267 [27:18<1:03:38, 20.31s/it]
30%|βββ | 80/267 [27:38<1:03:09, 20.27s/it]
{'loss': 0.8199, 'learning_rate': 5e-06, 'epoch': 0.9} |
|
30%|βββ | 80/267 [27:38<1:03:09, 20.27s/it]
30%|βββ | 81/267 [27:58<1:02:40, 20.22s/it]
{'loss': 0.8325, 'learning_rate': 5e-06, 'epoch': 0.91} |
|
30%|βββ | 81/267 [27:58<1:02:40, 20.22s/it]
31%|βββ | 82/267 [28:18<1:02:24, 20.24s/it]
{'loss': 0.8134, 'learning_rate': 5e-06, 'epoch': 0.92} |
|
31%|βββ | 82/267 [28:18<1:02:24, 20.24s/it]
31%|βββ | 83/267 [28:39<1:02:06, 20.25s/it]
{'loss': 0.8275, 'learning_rate': 5e-06, 'epoch': 0.93} |
|
31%|βββ | 83/267 [28:39<1:02:06, 20.25s/it]
31%|ββββ | 84/267 [28:59<1:01:33, 20.19s/it]
{'loss': 0.8296, 'learning_rate': 5e-06, 'epoch': 0.94} |
|
31%|ββββ | 84/267 [28:59<1:01:33, 20.19s/it]
32%|ββββ | 85/267 [29:19<1:01:10, 20.17s/it]
{'loss': 0.8241, 'learning_rate': 5e-06, 'epoch': 0.96} |
|
32%|ββββ | 85/267 [29:19<1:01:10, 20.17s/it]
32%|ββββ | 86/267 [29:39<1:00:52, 20.18s/it]
{'loss': 0.8329, 'learning_rate': 5e-06, 'epoch': 0.97} |
|
32%|ββββ | 86/267 [29:39<1:00:52, 20.18s/it]
33%|ββββ | 87/267 [30:00<1:01:04, 20.36s/it]
{'loss': 0.8399, 'learning_rate': 5e-06, 'epoch': 0.98} |
|
33%|ββββ | 87/267 [30:00<1:01:04, 20.36s/it]
33%|ββββ | 88/267 [30:20<1:00:26, 20.26s/it]
{'loss': 0.8028, 'learning_rate': 5e-06, 'epoch': 0.99} |
|
33%|ββββ | 88/267 [30:20<1:00:26, 20.26s/it]
33%|ββββ | 89/267 [30:43<1:02:42, 21.14s/it]
{'loss': 0.7865, 'learning_rate': 5e-06, 'epoch': 1.0} |
|
33%|ββββ | 89/267 [30:43<1:02:42, 21.14s/it]
34%|ββββ | 90/267 [31:16<1:13:08, 24.79s/it]
{'loss': 0.7275, 'learning_rate': 5e-06, 'epoch': 1.01} |
|
34%|ββββ | 90/267 [31:16<1:13:08, 24.79s/it]
34%|ββββ | 91/267 [31:37<1:08:53, 23.49s/it]
{'loss': 0.7466, 'learning_rate': 5e-06, 'epoch': 1.02} |
|
34%|ββββ | 91/267 [31:37<1:08:53, 23.49s/it]
34%|ββββ | 92/267 [31:57<1:05:32, 22.47s/it]
{'loss': 0.7358, 'learning_rate': 5e-06, 'epoch': 1.03} |
|
34%|ββββ | 92/267 [31:57<1:05:32, 22.47s/it]
35%|ββββ | 93/267 [32:17<1:03:12, 21.80s/it]
{'loss': 0.7381, 'learning_rate': 5e-06, 'epoch': 1.04} |
|
35%|ββββ | 93/267 [32:17<1:03:12, 21.80s/it]
35%|ββββ | 94/267 [32:37<1:01:34, 21.35s/it]
{'loss': 0.7502, 'learning_rate': 5e-06, 'epoch': 1.06} |
|
35%|ββββ | 94/267 [32:37<1:01:34, 21.35s/it]
36%|ββββ | 95/267 [32:58<1:00:22, 21.06s/it]
{'loss': 0.7431, 'learning_rate': 5e-06, 'epoch': 1.07} |
|
36%|ββββ | 95/267 [32:58<1:00:22, 21.06s/it]
36%|ββββ | 96/267 [33:18<59:38, 20.93s/it]
{'loss': 0.733, 'learning_rate': 5e-06, 'epoch': 1.08} |
|
36%|ββββ | 96/267 [33:18<59:38, 20.93s/it]
36%|ββββ | 97/267 [33:39<59:01, 20.83s/it]
{'loss': 0.7073, 'learning_rate': 5e-06, 'epoch': 1.09} |
|
36%|ββββ | 97/267 [33:39<59:01, 20.83s/it]
37%|ββββ | 98/267 [33:59<58:16, 20.69s/it]
{'loss': 0.7287, 'learning_rate': 5e-06, 'epoch': 1.1} |
|
37%|ββββ | 98/267 [33:59<58:16, 20.69s/it]
37%|ββββ | 99/267 [34:20<57:39, 20.59s/it]
{'loss': 0.7415, 'learning_rate': 5e-06, 'epoch': 1.11} |
|
37%|ββββ | 99/267 [34:20<57:39, 20.59s/it]
37%|ββββ | 100/267 [34:40<57:06, 20.52s/it]
{'loss': 0.7511, 'learning_rate': 5e-06, 'epoch': 1.12} |
|
 37%|ββββ      | 100/267 [34:40<57:06, 20.52s/it]
Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. |
|
Non-default generation parameters: {'max_length': 4096} |
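Per the linked transformers docs, one way to address this warning is to store the non-default generation settings in a GenerationConfig saved alongside the model (a sketch, assuming only max_length needs to move; this run does not do this):

# Sketch: save the non-default generation parameter as a GenerationConfig file.
from transformers import GenerationConfig

generation_config = GenerationConfig(max_length=4096)
generation_config.save_pretrained("/vol3/home/ctr/llava-rlhf/models/llava-v1.5-7b-sft-policy-v2")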
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
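The two checkpoint warnings above are emitted when gradient checkpointing runs with its default reentrant implementation; the requires_grad notice typically just means the checkpointed segment's inputs are frozen. A minimal sketch of silencing the deprecation by passing use_reentrant explicitly, assuming a transformers version whose TrainingArguments accepts gradient_checkpointing_kwargs (older versions only expose gradient_checkpointing=True):

```python
# Sketch: make the gradient-checkpointing variant explicit so torch stops warning.
# gradient_checkpointing_kwargs is assumed to exist in the installed transformers.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",  # illustrative
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
```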
38%|ββββ | 101/267 [36:22<2:04:17, 44.92s/it]
{'loss': 0.7342, 'learning_rate': 5e-06, 'epoch': 1.13} |
|
38%|ββββ | 101/267 [36:22<2:04:17, 44.92s/it]
38%|ββββ | 102/267 [36:42<1:43:09, 37.51s/it]
{'loss': 0.7284, 'learning_rate': 5e-06, 'epoch': 1.15} |
|
38%|ββββ | 102/267 [36:42<1:43:09, 37.51s/it]
39%|ββββ | 103/267 [37:03<1:28:26, 32.36s/it]
{'loss': 0.7386, 'learning_rate': 5e-06, 'epoch': 1.16} |
|
39%|ββββ | 103/267 [37:03<1:28:26, 32.36s/it]
39%|ββββ | 104/267 [37:22<1:17:45, 28.62s/it]
{'loss': 0.7496, 'learning_rate': 5e-06, 'epoch': 1.17} |
|
39%|ββββ | 104/267 [37:22<1:17:45, 28.62s/it]
39%|ββββ | 105/267 [37:43<1:10:31, 26.12s/it]
{'loss': 0.7251, 'learning_rate': 5e-06, 'epoch': 1.18} |
|
39%|ββββ | 105/267 [37:43<1:10:31, 26.12s/it]
40%|ββββ | 106/267 [38:03<1:05:04, 24.25s/it]
{'loss': 0.724, 'learning_rate': 5e-06, 'epoch': 1.19} |
|
40%|ββββ | 106/267 [38:03<1:05:04, 24.25s/it]
40%|ββββ | 107/267 [38:23<1:01:25, 23.04s/it]
{'loss': 0.7323, 'learning_rate': 5e-06, 'epoch': 1.2} |
|
40%|ββββ | 107/267 [38:23<1:01:25, 23.04s/it]
40%|ββββ | 108/267 [38:43<58:35, 22.11s/it]
{'loss': 0.7379, 'learning_rate': 5e-06, 'epoch': 1.21} |
|
40%|ββββ | 108/267 [38:43<58:35, 22.11s/it]
41%|ββββ | 109/267 [39:03<56:42, 21.53s/it]
{'loss': 0.7255, 'learning_rate': 5e-06, 'epoch': 1.22} |
|
41%|ββββ | 109/267 [39:03<56:42, 21.53s/it]
41%|ββββ | 110/267 [39:23<55:03, 21.04s/it]
{'loss': 0.7535, 'learning_rate': 5e-06, 'epoch': 1.24} |
|
41%|ββββ | 110/267 [39:23<55:03, 21.04s/it]
42%|βββββ | 111/267 [39:43<54:02, 20.78s/it]
{'loss': 0.7613, 'learning_rate': 5e-06, 'epoch': 1.25} |
|
42%|βββββ | 111/267 [39:43<54:02, 20.78s/it]
42%|βββββ | 112/267 [40:03<53:00, 20.52s/it]
{'loss': 0.7386, 'learning_rate': 5e-06, 'epoch': 1.26} |
|
42%|βββββ | 112/267 [40:03<53:00, 20.52s/it]
42%|βββββ | 113/267 [40:23<52:12, 20.34s/it]
{'loss': 0.7346, 'learning_rate': 5e-06, 'epoch': 1.27} |
|
42%|βββββ | 113/267 [40:23<52:12, 20.34s/it]
43%|βββββ | 114/267 [40:43<51:39, 20.26s/it]
{'loss': 0.7472, 'learning_rate': 5e-06, 'epoch': 1.28} |
|
43%|βββββ | 114/267 [40:43<51:39, 20.26s/it]
43%|βββββ | 115/267 [41:03<51:13, 20.22s/it]
{'loss': 0.7364, 'learning_rate': 5e-06, 'epoch': 1.29} |
|
43%|βββββ | 115/267 [41:03<51:13, 20.22s/it]
43%|βββββ | 116/267 [41:23<50:51, 20.21s/it]
{'loss': 0.7352, 'learning_rate': 5e-06, 'epoch': 1.3} |
|
43%|βββββ | 116/267 [41:23<50:51, 20.21s/it]
44%|βββββ | 117/267 [41:43<50:03, 20.02s/it]
{'loss': 0.7231, 'learning_rate': 5e-06, 'epoch': 1.31} |
|
44%|βββββ | 117/267 [41:43<50:03, 20.02s/it]
44%|βββββ | 118/267 [42:03<49:38, 19.99s/it]
{'loss': 0.7409, 'learning_rate': 5e-06, 'epoch': 1.33} |
|
44%|βββββ | 118/267 [42:03<49:38, 19.99s/it]
45%|βββββ | 119/267 [42:23<49:11, 19.94s/it]
{'loss': 0.7339, 'learning_rate': 5e-06, 'epoch': 1.34} |
|
45%|βββββ | 119/267 [42:23<49:11, 19.94s/it]
45%|βββββ | 120/267 [42:43<49:03, 20.02s/it]
{'loss': 0.7184, 'learning_rate': 5e-06, 'epoch': 1.35} |
|
45%|βββββ | 120/267 [42:43<49:03, 20.02s/it]
45%|βββββ | 121/267 [43:03<48:41, 20.01s/it]
{'loss': 0.7254, 'learning_rate': 5e-06, 'epoch': 1.36} |
|
45%|βββββ | 121/267 [43:03<48:41, 20.01s/it]
46%|βββββ | 122/267 [43:23<48:25, 20.03s/it]
{'loss': 0.7193, 'learning_rate': 5e-06, 'epoch': 1.37} |
|
46%|βββββ | 122/267 [43:23<48:25, 20.03s/it]
46%|βββββ | 123/267 [43:43<48:05, 20.04s/it]
{'loss': 0.7309, 'learning_rate': 5e-06, 'epoch': 1.38} |
|
46%|βββββ | 123/267 [43:43<48:05, 20.04s/it]
46%|βββββ | 124/267 [44:03<47:39, 19.99s/it]
{'loss': 0.7609, 'learning_rate': 5e-06, 'epoch': 1.39} |
|
46%|βββββ | 124/267 [44:03<47:39, 19.99s/it]
47%|βββββ | 125/267 [44:23<47:13, 19.95s/it]
{'loss': 0.7334, 'learning_rate': 5e-06, 'epoch': 1.4} |
|
47%|βββββ | 125/267 [44:23<47:13, 19.95s/it]
47%|βββββ | 126/267 [44:43<46:54, 19.96s/it]
{'loss': 0.7363, 'learning_rate': 5e-06, 'epoch': 1.42} |
|
47%|βββββ | 126/267 [44:43<46:54, 19.96s/it]
48%|βββββ | 127/267 [45:03<46:31, 19.94s/it]
{'loss': 0.7294, 'learning_rate': 5e-06, 'epoch': 1.43} |
|
48%|βββββ | 127/267 [45:03<46:31, 19.94s/it]
48%|βββββ | 128/267 [45:23<46:12, 19.94s/it]
{'loss': 0.732, 'learning_rate': 5e-06, 'epoch': 1.44} |
|
48%|βββββ | 128/267 [45:23<46:12, 19.94s/it]
48%|βββββ | 129/267 [45:43<45:58, 19.99s/it]
{'loss': 0.7407, 'learning_rate': 5e-06, 'epoch': 1.45} |
|
48%|βββββ | 129/267 [45:43<45:58, 19.99s/it]
49%|βββββ | 130/267 [46:03<45:36, 19.97s/it]
{'loss': 0.7171, 'learning_rate': 5e-06, 'epoch': 1.46} |
|
49%|βββββ | 130/267 [46:03<45:36, 19.97s/it]
49%|βββββ | 131/267 [46:23<45:26, 20.05s/it]
{'loss': 0.7301, 'learning_rate': 5e-06, 'epoch': 1.47} |
|
49%|βββββ | 131/267 [46:23<45:26, 20.05s/it]
49%|βββββ | 132/267 [46:43<44:57, 19.98s/it]
{'loss': 0.7126, 'learning_rate': 5e-06, 'epoch': 1.48} |
|
49%|βββββ | 132/267 [46:43<44:57, 19.98s/it]
50%|βββββ | 133/267 [47:03<44:36, 19.98s/it]
{'loss': 0.7067, 'learning_rate': 5e-06, 'epoch': 1.49} |
|
50%|βββββ | 133/267 [47:03<44:36, 19.98s/it]
50%|βββββ | 134/267 [47:23<44:19, 20.00s/it]
{'loss': 0.7291, 'learning_rate': 5e-06, 'epoch': 1.51} |
|
50%|βββββ | 134/267 [47:23<44:19, 20.00s/it]
51%|βββββ | 135/267 [47:43<43:57, 19.98s/it]
{'loss': 0.734, 'learning_rate': 5e-06, 'epoch': 1.52} |
|
51%|βββββ | 135/267 [47:43<43:57, 19.98s/it]
51%|βββββ | 136/267 [48:03<43:42, 20.02s/it]
{'loss': 0.7242, 'learning_rate': 5e-06, 'epoch': 1.53} |
|
51%|βββββ | 136/267 [48:03<43:42, 20.02s/it]
51%|ββββββ | 137/267 [48:23<43:17, 19.98s/it]
{'loss': 0.7328, 'learning_rate': 5e-06, 'epoch': 1.54} |
|
51%|ββββββ | 137/267 [48:23<43:17, 19.98s/it]
52%|ββββββ | 138/267 [48:43<42:59, 20.00s/it]
{'loss': 0.7544, 'learning_rate': 5e-06, 'epoch': 1.55} |
|
52%|ββββββ | 138/267 [48:43<42:59, 20.00s/it]
52%|ββββββ | 139/267 [49:03<42:42, 20.02s/it]
{'loss': 0.7249, 'learning_rate': 5e-06, 'epoch': 1.56} |
|
52%|ββββββ | 139/267 [49:03<42:42, 20.02s/it]
52%|ββββββ | 140/267 [49:23<42:31, 20.09s/it]
{'loss': 0.7123, 'learning_rate': 5e-06, 'epoch': 1.57} |
|
52%|ββββββ | 140/267 [49:23<42:31, 20.09s/it]
53%|ββββββ | 141/267 [49:43<42:07, 20.06s/it]
{'loss': 0.7262, 'learning_rate': 5e-06, 'epoch': 1.58} |
|
53%|ββββββ | 141/267 [49:43<42:07, 20.06s/it]
53%|ββββββ | 142/267 [50:03<41:42, 20.02s/it]
{'loss': 0.7297, 'learning_rate': 5e-06, 'epoch': 1.6} |
|
53%|ββββββ | 142/267 [50:03<41:42, 20.02s/it]
54%|ββββββ | 143/267 [50:23<41:26, 20.06s/it]
{'loss': 0.7175, 'learning_rate': 5e-06, 'epoch': 1.61} |
|
54%|ββββββ | 143/267 [50:23<41:26, 20.06s/it]
54%|ββββββ | 144/267 [50:43<41:07, 20.06s/it]
{'loss': 0.7, 'learning_rate': 5e-06, 'epoch': 1.62} |
|
54%|ββββββ | 144/267 [50:43<41:07, 20.06s/it]
54%|ββββββ | 145/267 [51:03<40:53, 20.11s/it]
{'loss': 0.7228, 'learning_rate': 5e-06, 'epoch': 1.63} |
|
54%|ββββββ | 145/267 [51:03<40:53, 20.11s/it]
55%|ββββββ | 146/267 [51:23<40:24, 20.04s/it]
{'loss': 0.7125, 'learning_rate': 5e-06, 'epoch': 1.64} |
|
55%|ββββββ | 146/267 [51:23<40:24, 20.04s/it]
55%|ββββββ | 147/267 [51:43<40:09, 20.08s/it]
{'loss': 0.7229, 'learning_rate': 5e-06, 'epoch': 1.65} |
|
55%|ββββββ | 147/267 [51:43<40:09, 20.08s/it]
55%|ββββββ | 148/267 [52:03<39:46, 20.06s/it]
{'loss': 0.7216, 'learning_rate': 5e-06, 'epoch': 1.66} |
|
55%|ββββββ | 148/267 [52:03<39:46, 20.06s/it]
56%|ββββββ | 149/267 [52:23<39:24, 20.04s/it]
{'loss': 0.7343, 'learning_rate': 5e-06, 'epoch': 1.67} |
|
56%|ββββββ | 149/267 [52:23<39:24, 20.04s/it]
56%|ββββββ | 150/267 [52:43<39:00, 20.00s/it]
{'loss': 0.706, 'learning_rate': 5e-06, 'epoch': 1.69} |
|
56%|ββββββ | 150/267 [52:43<39:00, 20.00s/it]
57%|ββββββ | 151/267 [53:03<38:42, 20.02s/it]
{'loss': 0.7111, 'learning_rate': 5e-06, 'epoch': 1.7} |
|
57%|ββββββ | 151/267 [53:03<38:42, 20.02s/it]
57%|ββββββ | 152/267 [53:23<38:16, 19.97s/it]
{'loss': 0.7305, 'learning_rate': 5e-06, 'epoch': 1.71} |
|
57%|ββββββ | 152/267 [53:23<38:16, 19.97s/it]
57%|ββββββ | 153/267 [53:43<37:58, 19.99s/it]
{'loss': 0.7272, 'learning_rate': 5e-06, 'epoch': 1.72} |
|
57%|ββββββ | 153/267 [53:43<37:58, 19.99s/it]
58%|ββββββ | 154/267 [54:03<37:37, 19.98s/it]
{'loss': 0.7374, 'learning_rate': 5e-06, 'epoch': 1.73} |
|
58%|ββββββ | 154/267 [54:03<37:37, 19.98s/it]
58%|ββββββ | 155/267 [54:23<37:17, 19.97s/it]
{'loss': 0.7287, 'learning_rate': 5e-06, 'epoch': 1.74} |
|
58%|ββββββ | 155/267 [54:23<37:17, 19.97s/it]
58%|ββββββ | 156/267 [54:43<36:52, 19.93s/it]
{'loss': 0.7277, 'learning_rate': 5e-06, 'epoch': 1.75} |
|
58%|ββββββ | 156/267 [54:43<36:52, 19.93s/it]
59%|ββββββ | 157/267 [55:03<36:30, 19.91s/it]
{'loss': 0.7158, 'learning_rate': 5e-06, 'epoch': 1.76} |
|
59%|ββββββ | 157/267 [55:03<36:30, 19.91s/it]
59%|ββββββ | 158/267 [55:23<36:14, 19.95s/it]
{'loss': 0.728, 'learning_rate': 5e-06, 'epoch': 1.78} |
|
59%|ββββββ | 158/267 [55:23<36:14, 19.95s/it]
60%|ββββββ | 159/267 [55:43<35:47, 19.88s/it]
{'loss': 0.7297, 'learning_rate': 5e-06, 'epoch': 1.79} |
|
60%|ββββββ | 159/267 [55:43<35:47, 19.88s/it]
60%|ββββββ | 160/267 [56:02<35:26, 19.88s/it]
{'loss': 0.7226, 'learning_rate': 5e-06, 'epoch': 1.8} |
|
60%|ββββββ | 160/267 [56:02<35:26, 19.88s/it]
60%|ββββββ | 161/267 [56:22<35:08, 19.89s/it]
{'loss': 0.7409, 'learning_rate': 5e-06, 'epoch': 1.81} |
|
60%|ββββββ | 161/267 [56:22<35:08, 19.89s/it]
61%|ββββββ | 162/267 [56:42<34:48, 19.89s/it]
{'loss': 0.7516, 'learning_rate': 5e-06, 'epoch': 1.82} |
|
61%|ββββββ | 162/267 [56:42<34:48, 19.89s/it]
61%|ββββββ | 163/267 [57:02<34:37, 19.97s/it]
{'loss': 0.7132, 'learning_rate': 5e-06, 'epoch': 1.83} |
|
61%|ββββββ | 163/267 [57:02<34:37, 19.97s/it]
61%|βββββββ | 164/267 [57:22<34:15, 19.96s/it]
{'loss': 0.7359, 'learning_rate': 5e-06, 'epoch': 1.84} |
|
61%|βββββββ | 164/267 [57:22<34:15, 19.96s/it]
62%|βββββββ | 165/267 [57:42<33:54, 19.94s/it]
{'loss': 0.7151, 'learning_rate': 5e-06, 'epoch': 1.85} |
|
62%|βββββββ | 165/267 [57:42<33:54, 19.94s/it]
62%|βββββββ | 166/267 [58:02<33:42, 20.03s/it]
{'loss': 0.7157, 'learning_rate': 5e-06, 'epoch': 1.87} |
|
62%|βββββββ | 166/267 [58:02<33:42, 20.03s/it]
63%|βββββββ | 167/267 [58:22<33:18, 19.98s/it]
{'loss': 0.7336, 'learning_rate': 5e-06, 'epoch': 1.88} |
|
63%|βββββββ | 167/267 [58:22<33:18, 19.98s/it]
63%|βββββββ | 168/267 [58:42<33:01, 20.01s/it]
{'loss': 0.7079, 'learning_rate': 5e-06, 'epoch': 1.89} |
|
63%|βββββββ | 168/267 [58:42<33:01, 20.01s/it]
63%|βββββββ | 169/267 [59:03<32:45, 20.05s/it]
{'loss': 0.7388, 'learning_rate': 5e-06, 'epoch': 1.9} |
|
63%|βββββββ | 169/267 [59:03<32:45, 20.05s/it]
64%|βββββββ | 170/267 [59:23<32:26, 20.06s/it]
{'loss': 0.7248, 'learning_rate': 5e-06, 'epoch': 1.91} |
|
64%|βββββββ | 170/267 [59:23<32:26, 20.06s/it]
64%|βββββββ | 171/267 [59:43<32:14, 20.15s/it]
{'loss': 0.7083, 'learning_rate': 5e-06, 'epoch': 1.92} |
|
64%|βββββββ | 171/267 [59:43<32:14, 20.15s/it]
64%|βββββββ | 172/267 [1:00:03<31:58, 20.20s/it]
{'loss': 0.7075, 'learning_rate': 5e-06, 'epoch': 1.93} |
|
64%|βββββββ | 172/267 [1:00:03<31:58, 20.20s/it]
65%|βββββββ | 173/267 [1:00:24<31:40, 20.21s/it]
{'loss': 0.7264, 'learning_rate': 5e-06, 'epoch': 1.94} |
|
65%|βββββββ | 173/267 [1:00:24<31:40, 20.21s/it]
65%|βββββββ | 174/267 [1:00:44<31:13, 20.15s/it]
{'loss': 0.7199, 'learning_rate': 5e-06, 'epoch': 1.96} |
|
65%|βββββββ | 174/267 [1:00:44<31:13, 20.15s/it]
66%|βββββββ | 175/267 [1:01:04<31:00, 20.22s/it]
{'loss': 0.714, 'learning_rate': 5e-06, 'epoch': 1.97} |
|
66%|βββββββ | 175/267 [1:01:04<31:00, 20.22s/it]
66%|βββββββ | 176/267 [1:01:24<30:42, 20.25s/it]
{'loss': 0.716, 'learning_rate': 5e-06, 'epoch': 1.98} |
|
66%|βββββββ | 176/267 [1:01:24<30:42, 20.25s/it]
66%|βββββββ | 177/267 [1:01:44<30:20, 20.23s/it]
{'loss': 0.7152, 'learning_rate': 5e-06, 'epoch': 1.99} |
|
66%|βββββββ | 177/267 [1:01:44<30:20, 20.23s/it]
67%|βββββββ | 178/267 [1:02:08<31:39, 21.35s/it]
{'loss': 0.7016, 'learning_rate': 5e-06, 'epoch': 2.0} |
|
67%|βββββββ | 178/267 [1:02:08<31:39, 21.35s/it]
67%|βββββββ | 179/267 [1:02:39<35:34, 24.25s/it]
{'loss': 0.6147, 'learning_rate': 5e-06, 'epoch': 2.01} |
|
67%|βββββββ | 179/267 [1:02:39<35:34, 24.25s/it]
67%|βββββββ | 180/267 [1:03:00<33:34, 23.16s/it]
{'loss': 0.5979, 'learning_rate': 5e-06, 'epoch': 2.02} |
|
67%|βββββββ | 180/267 [1:03:00<33:34, 23.16s/it]
68%|βββββββ | 181/267 [1:03:20<32:00, 22.34s/it]
{'loss': 0.6181, 'learning_rate': 5e-06, 'epoch': 2.03} |
|
68%|βββββββ | 181/267 [1:03:20<32:00, 22.34s/it]
68%|βββββββ | 182/267 [1:03:41<30:51, 21.78s/it]
{'loss': 0.6053, 'learning_rate': 5e-06, 'epoch': 2.04} |
|
68%|βββββββ | 182/267 [1:03:41<30:51, 21.78s/it]
69%|βββββββ | 183/267 [1:04:01<29:47, 21.28s/it]
{'loss': 0.6088, 'learning_rate': 5e-06, 'epoch': 2.06} |
|
69%|βββββββ | 183/267 [1:04:01<29:47, 21.28s/it]
69%|βββββββ | 184/267 [1:04:21<29:04, 21.02s/it]
{'loss': 0.5949, 'learning_rate': 5e-06, 'epoch': 2.07} |
|
69%|βββββββ | 184/267 [1:04:21<29:04, 21.02s/it]
69%|βββββββ | 185/267 [1:04:42<28:28, 20.84s/it]
{'loss': 0.6015, 'learning_rate': 5e-06, 'epoch': 2.08} |
|
69%|βββββββ | 185/267 [1:04:42<28:28, 20.84s/it]
70%|βββββββ | 186/267 [1:05:02<27:58, 20.72s/it]
{'loss': 0.5888, 'learning_rate': 5e-06, 'epoch': 2.09} |
|
70%|βββββββ | 186/267 [1:05:02<27:58, 20.72s/it]
70%|βββββββ | 187/267 [1:05:23<27:35, 20.70s/it]
{'loss': 0.6106, 'learning_rate': 5e-06, 'epoch': 2.1} |
|
70%|βββββββ | 187/267 [1:05:23<27:35, 20.70s/it]
70%|βββββββ | 188/267 [1:05:43<27:09, 20.63s/it]
{'loss': 0.5986, 'learning_rate': 5e-06, 'epoch': 2.11} |
|
70%|βββββββ | 188/267 [1:05:43<27:09, 20.63s/it]
71%|βββββββ | 189/267 [1:06:04<26:45, 20.59s/it]
{'loss': 0.5915, 'learning_rate': 5e-06, 'epoch': 2.12} |
|
71%|βββββββ | 189/267 [1:06:04<26:45, 20.59s/it]
71%|βββββββ | 190/267 [1:06:24<26:20, 20.52s/it]
{'loss': 0.593, 'learning_rate': 5e-06, 'epoch': 2.13} |
|
71%|βββββββ | 190/267 [1:06:24<26:20, 20.52s/it]
72%|ββββββββ | 191/267 [1:06:45<25:57, 20.49s/it]
{'loss': 0.6051, 'learning_rate': 5e-06, 'epoch': 2.15} |
|
72%|ββββββββ | 191/267 [1:06:45<25:57, 20.49s/it]
72%|ββββββββ | 192/267 [1:07:05<25:33, 20.44s/it]
{'loss': 0.5916, 'learning_rate': 5e-06, 'epoch': 2.16} |
|
72%|ββββββββ | 192/267 [1:07:05<25:33, 20.44s/it]
72%|ββββββββ | 193/267 [1:07:26<25:13, 20.46s/it]
{'loss': 0.5984, 'learning_rate': 5e-06, 'epoch': 2.17} |
|
72%|ββββββββ | 193/267 [1:07:26<25:13, 20.46s/it]
73%|ββββββββ | 194/267 [1:07:46<24:52, 20.44s/it]
{'loss': 0.5863, 'learning_rate': 5e-06, 'epoch': 2.18} |
|
73%|ββββββββ | 194/267 [1:07:46<24:52, 20.44s/it]
73%|ββββββββ | 195/267 [1:08:06<24:33, 20.47s/it]
{'loss': 0.6006, 'learning_rate': 5e-06, 'epoch': 2.19} |
|
73%|ββββββββ | 195/267 [1:08:06<24:33, 20.47s/it]
73%|ββββββββ | 196/267 [1:08:27<24:11, 20.45s/it]
{'loss': 0.5931, 'learning_rate': 5e-06, 'epoch': 2.2} |
|
73%|ββββββββ | 196/267 [1:08:27<24:11, 20.45s/it]
74%|ββββββββ | 197/267 [1:08:47<23:51, 20.45s/it]
{'loss': 0.5818, 'learning_rate': 5e-06, 'epoch': 2.21} |
|
74%|ββββββββ | 197/267 [1:08:47<23:51, 20.45s/it]
74%|ββββββββ | 198/267 [1:09:08<23:28, 20.41s/it]
{'loss': 0.5987, 'learning_rate': 5e-06, 'epoch': 2.22} |
|
74%|ββββββββ | 198/267 [1:09:08<23:28, 20.41s/it]
75%|ββββββββ | 199/267 [1:09:28<23:06, 20.40s/it]
{'loss': 0.5704, 'learning_rate': 5e-06, 'epoch': 2.24} |
|
75%|ββββββββ | 199/267 [1:09:28<23:06, 20.40s/it]
75%|ββββββββ | 200/267 [1:09:48<22:45, 20.39s/it]
{'loss': 0.5757, 'learning_rate': 5e-06, 'epoch': 2.25} |
|
 75%|ββββββββ  | 200/267 [1:09:48<22:45, 20.39s/it]
Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. |
|
Non-default generation parameters: {'max_length': 4096} |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https: |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. |
|
warnings.warn( |
|
/vol3/ctr/.conda/envs/llava_rest/lib/python3.10/site-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None |
|
warnings.warn( |
|
75%|ββββββββ | 201/267 [1:11:21<46:06, 41.92s/it]
{'loss': 0.6003, 'learning_rate': 5e-06, 'epoch': 2.26} |
|
75%|ββββββββ | 201/267 [1:11:21<46:06, 41.92s/it]
76%|ββββββββ | 202/267 [1:11:41<38:24, 35.45s/it]
{'loss': 0.5656, 'learning_rate': 5e-06, 'epoch': 2.27} |
|
76%|ββββββββ | 202/267 [1:11:41<38:24, 35.45s/it]
76%|ββββββββ | 203/267 [1:12:01<32:55, 30.86s/it]
{'loss': 0.5807, 'learning_rate': 5e-06, 'epoch': 2.28} |
|
76%|ββββββββ | 203/267 [1:12:01<32:55, 30.86s/it]
76%|ββββββββ | 204/267 [1:12:22<29:08, 27.75s/it]
{'loss': 0.6076, 'learning_rate': 5e-06, 'epoch': 2.29} |
|
76%|ββββββββ | 204/267 [1:12:22<29:08, 27.75s/it]
77%|ββββββββ | 205/267 [1:12:42<26:18, 25.45s/it]
{'loss': 0.5916, 'learning_rate': 5e-06, 'epoch': 2.3} |
|
77%|ββββββββ | 205/267 [1:12:42<26:18, 25.45s/it]
77%|ββββββββ | 206/267 [1:13:02<24:19, 23.92s/it]
{'loss': 0.585, 'learning_rate': 5e-06, 'epoch': 2.31} |
|
77%|ββββββββ | 206/267 [1:13:02<24:19, 23.92s/it]
78%|ββββββββ | 207/267 [1:13:22<22:46, 22.78s/it]
{'loss': 0.5925, 'learning_rate': 5e-06, 'epoch': 2.33} |
|
78%|ββββββββ | 207/267 [1:13:22<22:46, 22.78s/it]
78%|ββββββββ | 208/267 [1:13:42<21:31, 21.90s/it]
{'loss': 0.6012, 'learning_rate': 5e-06, 'epoch': 2.34} |
|
78%|ββββββββ | 208/267 [1:13:42<21:31, 21.90s/it]
78%|ββββββββ | 209/267 [1:14:02<20:36, 21.32s/it]
{'loss': 0.6171, 'learning_rate': 5e-06, 'epoch': 2.35} |
|
78%|ββββββββ | 209/267 [1:14:02<20:36, 21.32s/it]
79%|ββββββββ | 210/267 [1:14:22<19:50, 20.88s/it]
{'loss': 0.6051, 'learning_rate': 5e-06, 'epoch': 2.36} |
|
79%|ββββββββ | 210/267 [1:14:22<19:50, 20.88s/it]
79%|ββββββββ | 211/267 [1:14:42<19:14, 20.62s/it]
{'loss': 0.6039, 'learning_rate': 5e-06, 'epoch': 2.37} |
|
79%|ββββββββ | 211/267 [1:14:42<19:14, 20.62s/it]
79%|ββββββββ | 212/267 [1:15:02<18:44, 20.44s/it]
{'loss': 0.5901, 'learning_rate': 5e-06, 'epoch': 2.38} |
|
79%|ββββββββ | 212/267 [1:15:02<18:44, 20.44s/it]
80%|ββββββββ | 213/267 [1:15:22<18:18, 20.34s/it]
{'loss': 0.6036, 'learning_rate': 5e-06, 'epoch': 2.39} |
|
80%|ββββββββ | 213/267 [1:15:22<18:18, 20.34s/it]
80%|ββββββββ | 214/267 [1:15:42<17:52, 20.24s/it]
{'loss': 0.5914, 'learning_rate': 5e-06, 'epoch': 2.4} |
|
80%|ββββββββ | 214/267 [1:15:42<17:52, 20.24s/it]
81%|ββββββββ | 215/267 [1:16:02<17:27, 20.14s/it]
{'loss': 0.5896, 'learning_rate': 5e-06, 'epoch': 2.42} |
|
81%|ββββββββ | 215/267 [1:16:02<17:27, 20.14s/it]
81%|ββββββββ | 216/267 [1:16:22<17:07, 20.15s/it]
{'loss': 0.5889, 'learning_rate': 5e-06, 'epoch': 2.43} |
|
81%|ββββββββ | 216/267 [1:16:22<17:07, 20.15s/it]
81%|βββββββββ | 217/267 [1:16:42<16:45, 20.11s/it]
{'loss': 0.5947, 'learning_rate': 5e-06, 'epoch': 2.44} |
|
81%|βββββββββ | 217/267 [1:16:42<16:45, 20.11s/it]
82%|βββββββββ | 218/267 [1:17:02<16:24, 20.10s/it]
{'loss': 0.6007, 'learning_rate': 5e-06, 'epoch': 2.45} |
|
82%|βββββββββ | 218/267 [1:17:02<16:24, 20.10s/it]
82%|βββββββββ | 219/267 [1:17:22<16:04, 20.10s/it]
{'loss': 0.5879, 'learning_rate': 5e-06, 'epoch': 2.46} |
|
82%|βββββββββ | 219/267 [1:17:22<16:04, 20.10s/it]
82%|βββββββββ | 220/267 [1:17:42<15:44, 20.10s/it]
{'loss': 0.5837, 'learning_rate': 5e-06, 'epoch': 2.47} |
|
82%|βββββββββ | 220/267 [1:17:42<15:44, 20.10s/it]
83%|βββββββββ | 221/267 [1:18:02<15:25, 20.12s/it]
{'loss': 0.6065, 'learning_rate': 5e-06, 'epoch': 2.48} |
|
83%|βββββββββ | 221/267 [1:18:02<15:25, 20.12s/it]
83%|βββββββββ | 222/267 [1:18:22<15:02, 20.07s/it]
{'loss': 0.5973, 'learning_rate': 5e-06, 'epoch': 2.49} |
|
83%|βββββββββ | 222/267 [1:18:22<15:02, 20.07s/it]
84%|βββββββββ | 223/267 [1:18:42<14:39, 20.00s/it]
{'loss': 0.6104, 'learning_rate': 5e-06, 'epoch': 2.51} |
|
84%|βββββββββ | 223/267 [1:18:42<14:39, 20.00s/it]
84%|βββββββββ | 224/267 [1:19:02<14:17, 19.94s/it]
{'loss': 0.605, 'learning_rate': 5e-06, 'epoch': 2.52} |
|
84%|βββββββββ | 224/267 [1:19:02<14:17, 19.94s/it]
84%|βββββββββ | 225/267 [1:19:22<13:57, 19.95s/it]
{'loss': 0.5819, 'learning_rate': 5e-06, 'epoch': 2.53} |
|
84%|βββββββββ | 225/267 [1:19:22<13:57, 19.95s/it]
85%|βββββββββ | 226/267 [1:19:42<13:38, 19.95s/it]
{'loss': 0.5811, 'learning_rate': 5e-06, 'epoch': 2.54} |
|
85%|βββββββββ | 226/267 [1:19:42<13:38, 19.95s/it]
85%|βββββββββ | 227/267 [1:20:02<13:19, 20.00s/it]
{'loss': 0.6005, 'learning_rate': 5e-06, 'epoch': 2.55} |
|
85%|βββββββββ | 227/267 [1:20:02<13:19, 20.00s/it]
85%|βββββββββ | 228/267 [1:20:22<13:00, 20.01s/it]
{'loss': 0.5939, 'learning_rate': 5e-06, 'epoch': 2.56} |
|
85%|βββββββββ | 228/267 [1:20:22<13:00, 20.01s/it]
86%|βββββββββ | 229/267 [1:20:42<12:39, 19.98s/it]
{'loss': 0.5868, 'learning_rate': 5e-06, 'epoch': 2.57} |
|
86%|βββββββββ | 229/267 [1:20:42<12:39, 19.98s/it]
86%|βββββββββ | 230/267 [1:21:02<12:17, 19.93s/it]
{'loss': 0.5846, 'learning_rate': 5e-06, 'epoch': 2.58} |
|
86%|βββββββββ | 230/267 [1:21:02<12:17, 19.93s/it]
87%|βββββββββ | 231/267 [1:21:22<11:55, 19.89s/it]
{'loss': 0.5874, 'learning_rate': 5e-06, 'epoch': 2.6} |
|
87%|βββββββββ | 231/267 [1:21:22<11:55, 19.89s/it]
87%|βββββββββ | 232/267 [1:21:41<11:35, 19.86s/it]
{'loss': 0.5718, 'learning_rate': 5e-06, 'epoch': 2.61} |
|
87%|βββββββββ | 232/267 [1:21:41<11:35, 19.86s/it]
87%|βββββββββ | 233/267 [1:22:02<11:18, 19.96s/it]
{'loss': 0.5966, 'learning_rate': 5e-06, 'epoch': 2.62} |
|
87%|βββββββββ | 233/267 [1:22:02<11:18, 19.96s/it]
88%|βββββββββ | 234/267 [1:22:22<10:59, 19.99s/it]
{'loss': 0.6019, 'learning_rate': 5e-06, 'epoch': 2.63} |
|
88%|βββββββββ | 234/267 [1:22:22<10:59, 19.99s/it]
88%|βββββββββ | 235/267 [1:22:42<10:43, 20.11s/it]
{'loss': 0.5966, 'learning_rate': 5e-06, 'epoch': 2.64} |
|
88%|βββββββββ | 235/267 [1:22:42<10:43, 20.11s/it]
88%|βββββββββ | 236/267 [1:23:02<10:23, 20.13s/it]
{'loss': 0.5949, 'learning_rate': 5e-06, 'epoch': 2.65} |
|
88%|βββββββββ | 236/267 [1:23:02<10:23, 20.13s/it]
89%|βββββββββ | 237/267 [1:23:22<10:01, 20.04s/it]
{'loss': 0.5821, 'learning_rate': 5e-06, 'epoch': 2.66} |
|
89%|βββββββββ | 237/267 [1:23:22<10:01, 20.04s/it]
89%|βββββββββ | 238/267 [1:23:42<09:41, 20.04s/it]
{'loss': 0.5917, 'learning_rate': 5e-06, 'epoch': 2.67} |
|
89%|βββββββββ | 238/267 [1:23:42<09:41, 20.04s/it]
90%|βββββββββ | 239/267 [1:24:02<09:20, 20.02s/it]
{'loss': 0.5951, 'learning_rate': 5e-06, 'epoch': 2.69} |
|
90%|βββββββββ | 239/267 [1:24:02<09:20, 20.02s/it]
90%|βββββββββ | 240/267 [1:24:22<09:01, 20.06s/it]
{'loss': 0.5873, 'learning_rate': 5e-06, 'epoch': 2.7} |
|
90%|βββββββββ | 240/267 [1:24:22<09:01, 20.06s/it]
90%|βββββββββ | 241/267 [1:24:42<08:41, 20.06s/it]
{'loss': 0.6034, 'learning_rate': 5e-06, 'epoch': 2.71} |
|
90%|βββββββββ | 241/267 [1:24:42<08:41, 20.06s/it]
91%|βββββββββ | 242/267 [1:25:02<08:21, 20.08s/it]
{'loss': 0.5986, 'learning_rate': 5e-06, 'epoch': 2.72} |
|
91%|βββββββββ | 242/267 [1:25:02<08:21, 20.08s/it]
91%|βββββββββ | 243/267 [1:25:22<08:00, 20.03s/it]
{'loss': 0.618, 'learning_rate': 5e-06, 'epoch': 2.73} |
|
91%|βββββββββ | 243/267 [1:25:22<08:00, 20.03s/it]
91%|ββββββββββ| 244/267 [1:25:42<07:41, 20.05s/it]
{'loss': 0.5823, 'learning_rate': 5e-06, 'epoch': 2.74} |
|
91%|ββββββββββ| 244/267 [1:25:42<07:41, 20.05s/it]
92%|ββββββββββ| 245/267 [1:26:03<07:21, 20.09s/it]
{'loss': 0.5995, 'learning_rate': 5e-06, 'epoch': 2.75} |
|
92%|ββββββββββ| 245/267 [1:26:03<07:21, 20.09s/it]
92%|ββββββββββ| 246/267 [1:26:23<07:02, 20.11s/it]
{'loss': 0.5813, 'learning_rate': 5e-06, 'epoch': 2.76} |
|
92%|ββββββββββ| 246/267 [1:26:23<07:02, 20.11s/it]
93%|ββββββββββ| 247/267 [1:26:43<06:41, 20.09s/it]
{'loss': 0.5903, 'learning_rate': 5e-06, 'epoch': 2.78} |
|
93%|ββββββββββ| 247/267 [1:26:43<06:41, 20.09s/it]
93%|ββββββββββ| 248/267 [1:27:03<06:21, 20.06s/it]
{'loss': 0.6034, 'learning_rate': 5e-06, 'epoch': 2.79} |
|
93%|ββββββββββ| 248/267 [1:27:03<06:21, 20.06s/it]
93%|ββββββββββ| 249/267 [1:27:23<06:01, 20.08s/it]
{'loss': 0.5975, 'learning_rate': 5e-06, 'epoch': 2.8} |
|
93%|ββββββββββ| 249/267 [1:27:23<06:01, 20.08s/it]
94%|ββββββββββ| 250/267 [1:27:43<05:41, 20.07s/it]
{'loss': 0.5831, 'learning_rate': 5e-06, 'epoch': 2.81} |
|
94%|ββββββββββ| 250/267 [1:27:43<05:41, 20.07s/it]
94%|ββββββββββ| 251/267 [1:28:03<05:20, 20.00s/it]
{'loss': 0.6062, 'learning_rate': 5e-06, 'epoch': 2.82} |
|
94%|ββββββββββ| 251/267 [1:28:03<05:20, 20.00s/it]
94%|ββββββββββ| 252/267 [1:28:23<05:01, 20.10s/it]
{'loss': 0.6077, 'learning_rate': 5e-06, 'epoch': 2.83} |
|
94%|ββββββββββ| 252/267 [1:28:23<05:01, 20.10s/it]
95%|ββββββββββ| 253/267 [1:28:43<04:40, 20.03s/it]
{'loss': 0.6115, 'learning_rate': 5e-06, 'epoch': 2.84} |
|
95%|ββββββββββ| 253/267 [1:28:43<04:40, 20.03s/it]
95%|ββββββββββ| 254/267 [1:29:03<04:21, 20.09s/it]
{'loss': 0.5933, 'learning_rate': 5e-06, 'epoch': 2.85} |
|
95%|ββββββββββ| 254/267 [1:29:03<04:21, 20.09s/it]
96%|ββββββββββ| 255/267 [1:29:23<04:01, 20.10s/it]
{'loss': 0.602, 'learning_rate': 5e-06, 'epoch': 2.87} |
|
96%|ββββββββββ| 255/267 [1:29:23<04:01, 20.10s/it]
96%|ββββββββββ| 256/267 [1:29:43<03:40, 20.04s/it]
{'loss': 0.5888, 'learning_rate': 5e-06, 'epoch': 2.88} |
|
96%|ββββββββββ| 256/267 [1:29:43<03:40, 20.04s/it]
96%|ββββββββββ| 257/267 [1:30:03<03:20, 20.02s/it]
{'loss': 0.5887, 'learning_rate': 5e-06, 'epoch': 2.89} |
|
96%|ββββββββββ| 257/267 [1:30:03<03:20, 20.02s/it]
97%|ββββββββββ| 258/267 [1:30:23<02:59, 19.97s/it]
{'loss': 0.6002, 'learning_rate': 5e-06, 'epoch': 2.9} |
|
97%|ββββββββββ| 258/267 [1:30:23<02:59, 19.97s/it]
97%|ββββββββββ| 259/267 [1:30:43<02:40, 20.03s/it]
{'loss': 0.5965, 'learning_rate': 5e-06, 'epoch': 2.91} |
|
97%|ββββββββββ| 259/267 [1:30:43<02:40, 20.03s/it]
97%|ββββββββββ| 260/267 [1:31:03<02:19, 19.95s/it]
{'loss': 0.5944, 'learning_rate': 5e-06, 'epoch': 2.92} |
|
97%|ββββββββββ| 260/267 [1:31:03<02:19, 19.95s/it]
98%|ββββββββββ| 261/267 [1:31:23<01:59, 19.94s/it]
{'loss': 0.598, 'learning_rate': 5e-06, 'epoch': 2.93} |
|
98%|ββββββββββ| 261/267 [1:31:23<01:59, 19.94s/it]
98%|ββββββββββ| 262/267 [1:31:43<01:39, 19.94s/it]
{'loss': 0.6011, 'learning_rate': 5e-06, 'epoch': 2.94} |
|
98%|ββββββββββ| 262/267 [1:31:43<01:39, 19.94s/it]
99%|ββββββββββ| 263/267 [1:32:03<01:20, 20.02s/it]
{'loss': 0.6041, 'learning_rate': 5e-06, 'epoch': 2.96} |
|
99%|ββββββββββ| 263/267 [1:32:03<01:20, 20.02s/it]
99%|ββββββββββ| 264/267 [1:32:23<01:00, 20.04s/it]
{'loss': 0.5912, 'learning_rate': 5e-06, 'epoch': 2.97} |
|
99%|ββββββββββ| 264/267 [1:32:23<01:00, 20.04s/it]
99%|ββββββββββ| 265/267 [1:32:43<00:40, 20.02s/it]
{'loss': 0.5983, 'learning_rate': 5e-06, 'epoch': 2.98} |
|
99%|ββββββββββ| 265/267 [1:32:43<00:40, 20.02s/it]
100%|ββββββββββ| 266/267 [1:33:03<00:19, 19.96s/it]
{'loss': 0.6011, 'learning_rate': 5e-06, 'epoch': 2.99} |
|
100%|ββββββββββ| 266/267 [1:33:03<00:19, 19.96s/it]
100%|ββββββββββ| 267/267 [1:33:26<00:00, 21.00s/it]
{'loss': 0.589, 'learning_rate': 5e-06, 'epoch': 3.0} |
|
100%|ββββββββββ| 267/267 [1:33:26<00:00, 21.00s/it]
{'train_runtime': 5609.5068, 'train_samples_per_second': 9.122, 'train_steps_per_second': 0.048, 'train_loss': 0.7409698463111335, 'epoch': 3.0} |
|
100%|ββββββββββ| 267/267 [1:33:26<00:00, 21.00s/it]
100%|ββββββββββ| 267/267 [1:33:26<00:00, 21.00s/it] |
|
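As a rough consistency check on the summary line above, the reported throughput follows from the step count, runtime, and the launch flags used for this run (6 ranks, per-device batch size 16, gradient accumulation 2); a small sketch of the arithmetic:

```python
# Sanity-check the reported training summary using values from this log.
steps, runtime_s = 267, 5609.5068
per_device_bs, n_gpus, grad_accum = 16, 6, 2           # launch flags for this run
effective_batch = per_device_bs * n_gpus * grad_accum  # 192 samples per optimizer step

print(steps / runtime_s)                    # ~0.048, matching train_steps_per_second
print(steps * effective_batch / runtime_s)  # ~9.1; the logged 9.122 samples/s reflects the
                                            # exact dataset size (last step per epoch may be partial)
```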
Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. |
|
Non-default generation parameters: {'max_length': 4096} |
|
[2024-12-04 15:46:23,016] [INFO] [launch.py:347:main] Process 1931186 exits successfully. |
|
[2024-12-04 15:46:31,026] [INFO] [launch.py:347:main] Process 1931184 exits successfully. |
|
[2024-12-04 15:46:36,032] [INFO] [launch.py:347:main] Process 1931183 exits successfully. |
|
[2024-12-04 15:46:41,039] [INFO] [launch.py:347:main] Process 1931187 exits successfully. |
|
[2024-12-04 15:46:41,039] [INFO] [launch.py:347:main] Process 1931185 exits successfully. |
|
wandb: View run resilient-serenity-341 at: https: |
|
wandb: Find logs at: wandb/run-20241204_141227-svgisw9q/logs |
|
[2024-12-04 15:47:15,077] [INFO] [launch.py:347:main] Process 1931182 exits successfully. |
|
|