|
[2022-12-16 11:39:50,518] [WARNING] [runner.py:179:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. |
|
[2022-12-16 11:39:50,529] [INFO] [runner.py:508:main] cmd = /home/milan/hf_env/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_speech_recognition_seq2seq_streaming.py --deepspeed=ds_config.json --model_name_or_path=openai/whisper-large-v2 --dataset_name=mozilla-foundation/common_voice_11_0 --dataset_config_name=cs --language=czech --train_split_name=train+validation --eval_split_name=test --model_index_name=Whisper Large-v2 Czech CV11 v2 --max_steps=5000 --output_dir=./ --per_device_train_batch_size=32 --per_device_eval_batch_size=8 --gradient_accumulation_steps=2 --logging_steps=25 --learning_rate=1e-5 --warmup_steps=500 --evaluation_strategy=steps --eval_steps=1000 --save_strategy=steps --save_steps=1000 --generation_max_length=225 --length_column_name=input_length --max_duration_in_seconds=30 --text_column_name=sentence --freeze_feature_encoder=False --report_to=tensorboard --metric_for_best_model=wer --greater_is_better=False --load_best_model_at_end --gradient_checkpointing --fp16 --overwrite_output_dir --do_train --do_eval --predict_with_generate --do_normalize_eval --streaming=False --use_auth_token --push_to_hub |
|
[2022-12-16 11:39:52,063] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0]} |
|
[2022-12-16 11:39:52,063] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=1, node_rank=0 |
|
[2022-12-16 11:39:52,063] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]}) |
|
[2022-12-16 11:39:52,063] [INFO] [launch.py:162:main] dist_world_size=1 |
|
[2022-12-16 11:39:52,063] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0 |
|
[2022-12-16 11:39:56,169] [INFO] [comm.py:654:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl |
|
12/16/2022 11:39:56 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True |
|
12/16/2022 11:39:56 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( |
|
_n_gpu=1, |
|
adafactor=False, |
|
adam_beta1=0.9, |
|
adam_beta2=0.999, |
|
adam_epsilon=1e-08, |
|
auto_find_batch_size=False, |
|
bf16=False, |
|
bf16_full_eval=False, |
|
data_seed=None, |
|
dataloader_drop_last=False, |
|
dataloader_num_workers=0, |
|
dataloader_pin_memory=True, |
|
ddp_bucket_cap_mb=None, |
|
ddp_find_unused_parameters=None, |
|
ddp_timeout=1800, |
|
debug=[], |
|
deepspeed=ds_config.json, |
|
disable_tqdm=False, |
|
do_eval=True, |
|
do_predict=False, |
|
do_train=True, |
|
eval_accumulation_steps=None, |
|
eval_delay=0, |
|
eval_steps=1000, |
|
evaluation_strategy=steps, |
|
fp16=True, |
|
fp16_backend=auto, |
|
fp16_full_eval=False, |
|
fp16_opt_level=O1, |
|
fsdp=[], |
|
fsdp_min_num_params=0, |
|
fsdp_transformer_layer_cls_to_wrap=None, |
|
full_determinism=False, |
|
generation_max_length=225, |
|
generation_num_beams=None, |
|
gradient_accumulation_steps=2, |
|
gradient_checkpointing=True, |
|
greater_is_better=False, |
|
group_by_length=False, |
|
half_precision_backend=auto, |
|
hub_model_id=None, |
|
hub_private_repo=False, |
|
hub_strategy=every_save, |
|
hub_token=<HUB_TOKEN>, |
|
ignore_data_skip=False, |
|
include_inputs_for_metrics=False, |
|
jit_mode_eval=False, |
|
label_names=None, |
|
label_smoothing_factor=0.0, |
|
learning_rate=1e-05, |
|
length_column_name=input_length, |
|
load_best_model_at_end=True, |
|
local_rank=0, |
|
log_level=passive, |
|
log_level_replica=passive, |
|
log_on_each_node=True, |
|
logging_dir=./runs/Dec16_11-39-56_129-146-123-136, |
|
logging_first_step=False, |
|
logging_nan_inf_filter=True, |
|
logging_steps=25, |
|
logging_strategy=steps, |
|
lr_scheduler_type=linear, |
|
max_grad_norm=1.0, |
|
max_steps=5000, |
|
metric_for_best_model=wer, |
|
mp_parameters=, |
|
no_cuda=False, |
|
num_train_epochs=3.0, |
|
optim=adamw_hf, |
|
optim_args=None, |
|
output_dir=./, |
|
overwrite_output_dir=True, |
|
past_index=-1, |
|
per_device_eval_batch_size=8, |
|
per_device_train_batch_size=32, |
|
predict_with_generate=True, |
|
prediction_loss_only=False, |
|
push_to_hub=True, |
|
push_to_hub_model_id=None, |
|
push_to_hub_organization=None, |
|
push_to_hub_token=<PUSH_TO_HUB_TOKEN>, |
|
ray_scope=last, |
|
remove_unused_columns=True, |
|
report_to=['tensorboard'], |
|
resume_from_checkpoint=None, |
|
run_name=./, |
|
save_on_each_node=False, |
|
save_steps=1000, |
|
save_strategy=steps, |
|
save_total_limit=None, |
|
seed=42, |
|
sharded_ddp=[], |
|
skip_memory_metrics=True, |
|
sortish_sampler=False, |
|
tf32=None, |
|
torch_compile=False, |
|
torch_compile_backend=None, |
|
torch_compile_mode=None, |
|
torchdynamo=None, |
|
tpu_metrics_debug=False, |
|
tpu_num_cores=None, |
|
use_ipex=False, |
|
use_legacy_prediction_loop=False, |
|
use_mps_device=False, |
|
warmup_ratio=0.0, |
|
warmup_steps=500, |
|
weight_decay=0.0, |
|
xpu_backend=None, |
|
) |
|
12/16/2022 11:39:56 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( |
|
_n_gpu=1, |
|
adafactor=False, |
|
adam_beta1=0.9, |
|
adam_beta2=0.999, |
|
adam_epsilon=1e-08, |
|
auto_find_batch_size=False, |
|
bf16=False, |
|
bf16_full_eval=False, |
|
data_seed=None, |
|
dataloader_drop_last=False, |
|
dataloader_num_workers=0, |
|
dataloader_pin_memory=True, |
|
ddp_bucket_cap_mb=None, |
|
ddp_find_unused_parameters=None, |
|
ddp_timeout=1800, |
|
debug=[], |
|
deepspeed=ds_config.json, |
|
disable_tqdm=False, |
|
do_eval=True, |
|
do_predict=False, |
|
do_train=True, |
|
eval_accumulation_steps=None, |
|
eval_delay=0, |
|
eval_steps=1000, |
|
evaluation_strategy=steps, |
|
fp16=True, |
|
fp16_backend=auto, |
|
fp16_full_eval=False, |
|
fp16_opt_level=O1, |
|
fsdp=[], |
|
fsdp_min_num_params=0, |
|
fsdp_transformer_layer_cls_to_wrap=None, |
|
full_determinism=False, |
|
generation_max_length=225, |
|
generation_num_beams=None, |
|
gradient_accumulation_steps=2, |
|
gradient_checkpointing=True, |
|
greater_is_better=False, |
|
group_by_length=False, |
|
half_precision_backend=auto, |
|
hub_model_id=None, |
|
hub_private_repo=False, |
|
hub_strategy=every_save, |
|
hub_token=<HUB_TOKEN>, |
|
ignore_data_skip=False, |
|
include_inputs_for_metrics=False, |
|
jit_mode_eval=False, |
|
label_names=None, |
|
label_smoothing_factor=0.0, |
|
learning_rate=1e-05, |
|
length_column_name=input_length, |
|
load_best_model_at_end=True, |
|
local_rank=0, |
|
log_level=passive, |
|
log_level_replica=passive, |
|
log_on_each_node=True, |
|
logging_dir=./runs/Dec16_11-39-56_129-146-123-136, |
|
logging_first_step=False, |
|
logging_nan_inf_filter=True, |
|
logging_steps=25, |
|
logging_strategy=steps, |
|
lr_scheduler_type=linear, |
|
max_grad_norm=1.0, |
|
max_steps=5000, |
|
metric_for_best_model=wer, |
|
mp_parameters=, |
|
no_cuda=False, |
|
num_train_epochs=3.0, |
|
optim=adamw_hf, |
|
optim_args=None, |
|
output_dir=./, |
|
overwrite_output_dir=True, |
|
past_index=-1, |
|
per_device_eval_batch_size=8, |
|
per_device_train_batch_size=32, |
|
predict_with_generate=True, |
|
prediction_loss_only=False, |
|
push_to_hub=True, |
|
push_to_hub_model_id=None, |
|
push_to_hub_organization=None, |
|
push_to_hub_token=<PUSH_TO_HUB_TOKEN>, |
|
ray_scope=last, |
|
remove_unused_columns=True, |
|
report_to=['tensorboard'], |
|
resume_from_checkpoint=None, |
|
run_name=./, |
|
save_on_each_node=False, |
|
save_steps=1000, |
|
save_strategy=steps, |
|
save_total_limit=None, |
|
seed=42, |
|
sharded_ddp=[], |
|
skip_memory_metrics=True, |
|
sortish_sampler=False, |
|
tf32=None, |
|
torch_compile=False, |
|
torch_compile_backend=None, |
|
torch_compile_mode=None, |
|
torchdynamo=None, |
|
tpu_metrics_debug=False, |
|
tpu_num_cores=None, |
|
use_ipex=False, |
|
use_legacy_prediction_loop=False, |
|
use_mps_device=False, |
|
warmup_ratio=0.0, |
|
warmup_steps=500, |
|
weight_decay=0.0, |
|
xpu_backend=None, |
|
) |
|
12/16/2022 11:39:58 - INFO - datasets.info - Loading Dataset Infos from /home/milan/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:39:58 - INFO - datasets.builder - Generating dataset common_voice_11_0 (/home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) |
|
Downloading and preparing dataset common_voice_11_0/cs to /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f... |
|
12/16/2022 11:39:58 - INFO - datasets.builder - Dataset not on Hf google storage. Downloading and preparing it from source |
|
12/16/2022 11:39:58 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/n_shards.json not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpkpv72a0e |
|
12/16/2022 11:39:58 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/n_shards.json in cache at /home/milan/.cache/huggingface/datasets/downloads/a64fc6ccf85aaef9b75e8612e97f260d2435e8ffe8b7626c5c08499f466674da |
|
12/16/2022 11:39:58 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/a64fc6ccf85aaef9b75e8612e97f260d2435e8ffe8b7626c5c08499f466674da |
|
12/16/2022 11:39:58 - INFO - datasets.download.download_manager - Downloading took 0.0 min |
|
12/16/2022 11:39:58 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min |
|
12/16/2022 11:39:59 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/train/cs_train_0.tar not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmp38v7cdx_ |
|
12/16/2022 11:40:10 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/train/cs_train_0.tar in cache at /home/milan/.cache/huggingface/datasets/downloads/e65067c9e6680c37326687e28407d42076ea8a79d0230ccfa9f1a30beefa496d |
|
12/16/2022 11:40:10 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/e65067c9e6680c37326687e28407d42076ea8a79d0230ccfa9f1a30beefa496d |
|
12/16/2022 11:40:10 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/dev/cs_dev_0.tar not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmp4h1ecyhc |
|
12/16/2022 11:40:16 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/dev/cs_dev_0.tar in cache at /home/milan/.cache/huggingface/datasets/downloads/faef583dccc4451b03508c77f16c823c3e52f080e7d2696c47ef3bc4da88b993 |
|
12/16/2022 11:40:16 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/faef583dccc4451b03508c77f16c823c3e52f080e7d2696c47ef3bc4da88b993 |
|
12/16/2022 11:40:16 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/test/cs_test_0.tar not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpns2ng_im |
|
12/16/2022 11:40:22 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/test/cs_test_0.tar in cache at /home/milan/.cache/huggingface/datasets/downloads/3b199c261a4a515c549e967e06831827866d46648e11b24dc2f4b50880b69664 |
|
12/16/2022 11:40:22 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/3b199c261a4a515c549e967e06831827866d46648e11b24dc2f4b50880b69664 |
|
12/16/2022 11:40:23 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/other/cs_other_0.tar not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmp09yiimgu |
|
12/16/2022 11:40:28 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/other/cs_other_0.tar in cache at /home/milan/.cache/huggingface/datasets/downloads/e9daf9a9975b7a926609592de3dd2542f6bfa853753253cd36e1ff97a21d5b16 |
|
12/16/2022 11:40:28 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/e9daf9a9975b7a926609592de3dd2542f6bfa853753253cd36e1ff97a21d5b16 |
|
12/16/2022 11:40:29 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/invalidated/cs_invalidated_0.tar not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmp9u2s9gxm |
|
12/16/2022 11:40:30 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/cs/invalidated/cs_invalidated_0.tar in cache at /home/milan/.cache/huggingface/datasets/downloads/8f0074a128edd8f8eb60cf804a3ae6c95ebcea9a743285d0c76e6b2168fd6695 |
|
12/16/2022 11:40:30 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/8f0074a128edd8f8eb60cf804a3ae6c95ebcea9a743285d0c76e6b2168fd6695 |
|
12/16/2022 11:40:30 - INFO - datasets.download.download_manager - Downloading took 0.0 min |
|
12/16/2022 11:40:31 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min |
|
12/16/2022 11:40:38 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/train.tsv not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpvbfyt60l |
|
12/16/2022 11:40:39 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/train.tsv in cache at /home/milan/.cache/huggingface/datasets/downloads/e03f690a1b87aaf6a10df06930bea3ea97524f22df3689d7d4f6de39a04ad6df |
|
12/16/2022 11:40:39 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/e03f690a1b87aaf6a10df06930bea3ea97524f22df3689d7d4f6de39a04ad6df |
|
12/16/2022 11:40:39 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/dev.tsv not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpzcp057ge |
|
12/16/2022 11:40:41 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/dev.tsv in cache at /home/milan/.cache/huggingface/datasets/downloads/5ede6c8121be47f2c2e4cb6be8b87ef855e5e3966ea6288c369f3d760770b29b |
|
12/16/2022 11:40:41 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/5ede6c8121be47f2c2e4cb6be8b87ef855e5e3966ea6288c369f3d760770b29b |
|
12/16/2022 11:40:41 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/test.tsv not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpq1s9j9wp |
|
12/16/2022 11:40:42 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/test.tsv in cache at /home/milan/.cache/huggingface/datasets/downloads/af41495b66bafe30b9cd41b7495379553a285ecf658214a8659ed096da46180b |
|
12/16/2022 11:40:42 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/af41495b66bafe30b9cd41b7495379553a285ecf658214a8659ed096da46180b |
|
12/16/2022 11:40:43 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/other.tsv not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpibvglp0c |
|
12/16/2022 11:40:44 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/other.tsv in cache at /home/milan/.cache/huggingface/datasets/downloads/0b1cf69c260eb7bfb5a97457676fa623c013d75427595de37d10c03c53b2a184 |
|
12/16/2022 11:40:44 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/0b1cf69c260eb7bfb5a97457676fa623c013d75427595de37d10c03c53b2a184 |
|
12/16/2022 11:40:45 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/invalidated.tsv not found in cache or force_download set to True, downloading to /home/milan/.cache/huggingface/datasets/downloads/tmpxneku9q5 |
|
12/16/2022 11:40:45 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/cs/invalidated.tsv in cache at /home/milan/.cache/huggingface/datasets/downloads/678b61c8b7174e01bad8858c770f89625c71e9cbcfe3a7ca7905d4fab962dcbf |
|
12/16/2022 11:40:45 - INFO - datasets.utils.file_utils - creating metadata file for /home/milan/.cache/huggingface/datasets/downloads/678b61c8b7174e01bad8858c770f89625c71e9cbcfe3a7ca7905d4fab962dcbf |
|
12/16/2022 11:40:45 - INFO - datasets.download.download_manager - Downloading took 0.0 min |
|
12/16/2022 11:40:45 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min |
|
12/16/2022 11:40:45 - INFO - datasets.utils.info_utils - Unable to verify checksums. |
|
12/16/2022 11:40:45 - INFO - datasets.builder - Generating train split |
|
12/16/2022 11:40:50 - INFO - datasets.builder - Generating validation split |
|
12/16/2022 11:40:52 - INFO - datasets.builder - Generating test split |
|
12/16/2022 11:40:54 - INFO - datasets.builder - Generating other split |
|
12/16/2022 11:40:57 - INFO - datasets.builder - Generating invalidated split |
|
12/16/2022 11:40:57 - INFO - datasets.utils.info_utils - Unable to verify splits sizes. |
|
Dataset common_voice_11_0 downloaded and prepared to /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f. Subsequent calls will reuse this data. |
|
12/16/2022 11:40:59 - INFO - datasets.info - Loading Dataset Infos from /home/milan/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:40:59 - INFO - datasets.builder - Overwrite dataset info from restored data version. |
|
12/16/2022 11:40:59 - INFO - datasets.info - Loading Dataset info from /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:40:59 - WARNING - datasets.builder - Found cached dataset common_voice_11_0 (/home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) |
|
12/16/2022 11:40:59 - INFO - datasets.info - Loading Dataset info from /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:41:01 - INFO - datasets.info - Loading Dataset Infos from /home/milan/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:41:01 - INFO - datasets.builder - Overwrite dataset info from restored data version. |
|
12/16/2022 11:41:01 - INFO - datasets.info - Loading Dataset info from /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:41:01 - WARNING - datasets.builder - Found cached dataset common_voice_11_0 (/home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) |
|
12/16/2022 11:41:01 - INFO - datasets.info - Loading Dataset info from /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f |
|
12/16/2022 11:42:37 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-3d5c448b6a2bf0f7.arrow |
|
12/16/2022 12:19:41 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-7c1ff4193a3aa668.arrow |
|
12/16/2022 12:39:01 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/milan/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/cs/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-3470671e4cfe112f.arrow |
|
12/16/2022 12:39:03 - WARNING - huggingface_hub.repository - /home/milan/whisper-large2-czech-cv11-v2/./ is already a clone of https://huggingface.co/mikr/whisper-large2-czech-cv11-v2. Make sure you pull the latest changes with `repo.git_pull()`. |
|
[2022-12-16 12:39:07,229] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.7, git-hash=unknown, git-branch=unknown |
|
[2022-12-16 12:39:08,450] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False |
|
[2022-12-16 12:39:09,626] [WARNING] [cpu_adam.py:83:__init__] FP16 params for CPUAdam may not work on AMD CPUs |
|
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination |
|
ninja: no work to do. |
|
Time to load cpu_adam op: 2.82881236076355 seconds |
|
Adam Optimizer #0 is created with AVX2 arithmetic capability. |
|
Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1 |
|
[2022-12-16 12:39:14,051] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer |
|
[2022-12-16 12:39:14,354] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam |
|
[2022-12-16 12:39:14,354] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'> |
|
[2022-12-16 12:39:14,354] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer |
|
[2022-12-16 12:39:14,354] [INFO] [stage_1_and_2.py:140:__init__] Reduce bucket size 200000000 |
|
[2022-12-16 12:39:14,355] [INFO] [stage_1_and_2.py:141:__init__] Allgather bucket size 200000000 |
|
[2022-12-16 12:39:14,355] [INFO] [stage_1_and_2.py:142:__init__] CPU Offload: True |
|
[2022-12-16 12:39:14,355] [INFO] [stage_1_and_2.py:143:__init__] Round robin gradient partitioning: False |
|
ninja: no work to do. |
|
Time to load utils op: 0.3903844356536865 seconds |
|
Rank: 0 partition count [1] and sizes[(1543304960, False)] |
|
[2022-12-16 12:39:18,143] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states |
|
[2022-12-16 12:39:18,144] [INFO] [utils.py:828:see_memory_usage] MA 3.0 GB Max_MA 3.0 GB CA 5.99 GB Max_CA 6 GB |
|
[2022-12-16 12:39:18,144] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 15.41 GB, percent = 7.8% |
|
[2022-12-16 12:39:22,046] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states |
|
[2022-12-16 12:39:22,047] [INFO] [utils.py:828:see_memory_usage] MA 3.0 GB Max_MA 3.0 GB CA 5.99 GB Max_CA 6 GB |
|
[2022-12-16 12:39:22,047] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.07 GB, percent = 17.8% |
|
[2022-12-16 12:39:22,047] [INFO] [stage_1_and_2.py:525:__init__] optimizer state initialized |
|
[2022-12-16 12:39:22,119] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer |
|
[2022-12-16 12:39:22,120] [INFO] [utils.py:828:see_memory_usage] MA 3.0 GB Max_MA 3.0 GB CA 5.99 GB Max_CA 6 GB |
|
[2022-12-16 12:39:22,120] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.07 GB, percent = 17.8% |
|
[2022-12-16 12:39:22,144] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw |
|
[2022-12-16 12:39:22,144] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupLR |
|
[2022-12-16 12:39:22,144] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7f3094c6ecd0> |
|
[2022-12-16 12:39:22,144] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:39:22,146] [INFO] [config.py:1020:print] DeepSpeedEngine configuration: |
|
[2022-12-16 12:39:22,146] [INFO] [config.py:1024:print] activation_checkpointing_config { |
|
"partition_activations": false, |
|
"contiguous_memory_optimization": false, |
|
"cpu_checkpointing": false, |
|
"number_checkpoints": null, |
|
"synchronize_checkpoint_boundary": false, |
|
"profile": false |
|
} |
|
[2022-12-16 12:39:22,146] [INFO] [config.py:1024:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} |
|
[2022-12-16 12:39:22,146] [INFO] [config.py:1024:print] amp_enabled .................. False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] amp_params ................... False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] autotuning_config ............ { |
|
"enabled": false, |
|
"start_step": null, |
|
"end_step": null, |
|
"metric_path": null, |
|
"arg_mappings": null, |
|
"metric": "throughput", |
|
"model_info": null, |
|
"results_dir": "autotuning_results", |
|
"exps_dir": "autotuning_exps", |
|
"overwrite": true, |
|
"fast": true, |
|
"start_profile_step": 3, |
|
"end_profile_step": 5, |
|
"tuner_type": "gridsearch", |
|
"tuner_early_stopping": 5, |
|
"tuner_num_trials": 50, |
|
"model_info_path": null, |
|
"mp_size": 1, |
|
"max_train_batch_size": null, |
|
"min_train_batch_size": 1, |
|
"max_train_micro_batch_size_per_gpu": 1.024000e+03, |
|
"min_train_micro_batch_size_per_gpu": 1, |
|
"num_tuning_micro_batch_sizes": 3 |
|
} |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] bfloat16_enabled ............. False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] checkpoint_parallel_write_pipeline False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] checkpoint_tag_validation_enabled True |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] checkpoint_tag_validation_fail False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f3090ac2ee0> |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] communication_data_type ...... None |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] curriculum_enabled ........... False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] curriculum_params ............ False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] dataloader_drop_last ......... False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] disable_allgather ............ False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] dump_state ................... False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] eigenvalue_enabled ........... False |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] eigenvalue_gas_boundary_resolution 1 |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] eigenvalue_layer_name ........ bert.encoder.layer |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] eigenvalue_layer_num ......... 0 |
|
[2022-12-16 12:39:22,147] [INFO] [config.py:1024:print] eigenvalue_max_iter .......... 100 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] eigenvalue_stability ......... 1e-06 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] eigenvalue_tol ............... 0.01 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] eigenvalue_verbose ........... False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] elasticity_enabled ........... False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] flops_profiler_config ........ { |
|
"enabled": false, |
|
"profile_step": 1, |
|
"module_depth": -1, |
|
"top_modules": 1, |
|
"detailed": true, |
|
"output_file": null |
|
} |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] fp16_auto_cast ............... False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] fp16_enabled ................. True |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] fp16_master_weights_and_gradients False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] global_rank .................. 0 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] grad_accum_dtype ............. None |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] gradient_accumulation_steps .. 2 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] gradient_clipping ............ 1.0 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] gradient_predivide_factor .... 1.0 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] initial_dynamic_scale ........ 65536 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] load_universal_checkpoint .... False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] loss_scale ................... 0 |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] memory_breakdown ............. False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] monitor_config ............... <deepspeed.monitor.config.DeepSpeedMonitorConfig object at 0x7f3078f292b0> |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] nebula_config ................ { |
|
"enabled": false, |
|
"persistent_storage_path": null, |
|
"persistent_time_interval": 100, |
|
"num_of_version_in_retention": 2, |
|
"enable_nebula_load": true, |
|
"load_path": null |
|
} |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] optimizer_legacy_fusion ...... False |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] optimizer_name ............... adamw |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] optimizer_params ............. {'lr': 1e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0} |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} |
|
[2022-12-16 12:39:22,148] [INFO] [config.py:1024:print] pld_enabled .................. False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] pld_params ................... False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] prescale_gradients ........... False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] scheduler_name ............... WarmupLR |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 500} |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] sparse_attention ............. None |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] sparse_gradients_enabled ..... False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] steps_per_print .............. 10 |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] train_batch_size ............. 64 |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] train_micro_batch_size_per_gpu 32 |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] use_node_local_storage ....... False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] wall_clock_breakdown ......... False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] world_size ................... 1 |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] zero_allow_untested_optimizer False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] zero_enabled ................. True |
|
[2022-12-16 12:39:22,149] [INFO] [config.py:1024:print] zero_optimization_stage ...... 2 |
|
[2022-12-16 12:39:22,150] [INFO] [config.py:1009:print_user_config] json = { |
|
"fp16": { |
|
"enabled": true, |
|
"loss_scale": 0, |
|
"loss_scale_window": 1000, |
|
"initial_scale_power": 16, |
|
"hysteresis": 2, |
|
"min_loss_scale": 1 |
|
}, |
|
"optimizer": { |
|
"type": "AdamW", |
|
"params": { |
|
"lr": 1e-05, |
|
"betas": [0.9, 0.999], |
|
"eps": 1e-08, |
|
"weight_decay": 0.0 |
|
} |
|
}, |
|
"scheduler": { |
|
"type": "WarmupLR", |
|
"params": { |
|
"warmup_min_lr": 0, |
|
"warmup_max_lr": 1e-05, |
|
"warmup_num_steps": 500 |
|
} |
|
}, |
|
"zero_optimization": { |
|
"stage": 2, |
|
"offload_optimizer": { |
|
"device": "cpu", |
|
"pin_memory": true |
|
}, |
|
"allgather_partitions": true, |
|
"allgather_bucket_size": 2.000000e+08, |
|
"overlap_comm": true, |
|
"reduce_scatter": true, |
|
"reduce_bucket_size": 2.000000e+08, |
|
"contiguous_gradients": true |
|
}, |
|
"gradient_accumulation_steps": 2, |
|
"gradient_clipping": 1.0, |
|
"train_batch_size": 64, |
|
"train_micro_batch_size_per_gpu": 32 |
|
} |
|
Time to load utils op: 0.0003771781921386719 seconds |
|
[2022-12-16 12:39:47,145] [INFO] [timer.py:197:stop] 0/4, RunningAvgSamplesPerSec=6.344583010207686, CurrSamplesPerSec=5.697173747342324, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:39:58,446] [INFO] [timer.py:197:stop] 0/6, RunningAvgSamplesPerSec=6.354105820400516, CurrSamplesPerSec=5.7200148121144485, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:40:09,779] [INFO] [timer.py:197:stop] 0/8, RunningAvgSamplesPerSec=6.3511059843491315, CurrSamplesPerSec=5.706168744069791, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:40:21,155] [INFO] [timer.py:197:stop] 0/10, RunningAvgSamplesPerSec=6.343221806642607, CurrSamplesPerSec=5.690943305296631, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:40:32,488] [INFO] [timer.py:197:stop] 0/12, RunningAvgSamplesPerSec=6.343072944394993, CurrSamplesPerSec=5.725201867756091, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:40:43,873] [INFO] [timer.py:197:stop] 0/14, RunningAvgSamplesPerSec=6.337683883211206, CurrSamplesPerSec=5.689833779599867, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:40:55,274] [INFO] [timer.py:197:stop] 0/16, RunningAvgSamplesPerSec=6.332389036245992, CurrSamplesPerSec=5.669597853035386, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:41:06,603] [INFO] [timer.py:197:stop] 0/18, RunningAvgSamplesPerSec=6.33282615782689, CurrSamplesPerSec=5.706427845798502, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:41:17,385] [INFO] [stage_1_and_2.py:1765:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536 |
|
[2022-12-16 12:41:17,386] [INFO] [logging.py:68:log_dist] [Rank 0] step=10, skipped=1, lr=[3.535580269163017e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:41:17,387] [INFO] [timer.py:197:stop] 0/20, RunningAvgSamplesPerSec=6.372488127559496, CurrSamplesPerSec=6.3796996062220375, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:41:28,769] [INFO] [timer.py:197:stop] 0/22, RunningAvgSamplesPerSec=6.366574823404308, CurrSamplesPerSec=5.682797328337197, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:41:40,119] [INFO] [timer.py:197:stop] 0/24, RunningAvgSamplesPerSec=6.363487793851469, CurrSamplesPerSec=5.685089104119074, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:41:50,879] [INFO] [stage_1_and_2.py:1765:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0 |
|
[2022-12-16 12:41:50,881] [INFO] [timer.py:197:stop] 0/26, RunningAvgSamplesPerSec=6.392071905977898, CurrSamplesPerSec=6.394088716886378, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:02,274] [INFO] [timer.py:197:stop] 0/28, RunningAvgSamplesPerSec=6.384638482589085, CurrSamplesPerSec=5.6451629089442354, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:13,644] [INFO] [timer.py:197:stop] 0/30, RunningAvgSamplesPerSec=6.380158521481853, CurrSamplesPerSec=5.697199381370964, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:24,983] [INFO] [timer.py:197:stop] 0/32, RunningAvgSamplesPerSec=6.377276100579833, CurrSamplesPerSec=5.709599137836787, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:36,318] [INFO] [timer.py:197:stop] 0/34, RunningAvgSamplesPerSec=6.374173569489135, CurrSamplesPerSec=5.686944867860612, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:47,672] [INFO] [timer.py:197:stop] 0/36, RunningAvgSamplesPerSec=6.371391329214875, CurrSamplesPerSec=5.698267509700741, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:42:59,049] [INFO] [timer.py:197:stop] 0/38, RunningAvgSamplesPerSec=6.3681956430878595, CurrSamplesPerSec=5.687853197846394, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:43:10,396] [INFO] [logging.py:68:log_dist] [Rank 0] step=20, skipped=2, lr=[4.650931663140581e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:43:10,397] [INFO] [timer.py:197:stop] 0/40, RunningAvgSamplesPerSec=6.366312333365177, CurrSamplesPerSec=5.696548687004366, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:43:21,770] [INFO] [timer.py:197:stop] 0/42, RunningAvgSamplesPerSec=6.36387061297112, CurrSamplesPerSec=5.692867363700557, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:43:32,465] [INFO] [stage_1_and_2.py:1765:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 |
|
[2022-12-16 12:43:32,467] [INFO] [timer.py:197:stop] 0/44, RunningAvgSamplesPerSec=6.381094936769063, CurrSamplesPerSec=6.406390958081914, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:43:43,780] [INFO] [timer.py:197:stop] 0/46, RunningAvgSamplesPerSec=6.379753390930982, CurrSamplesPerSec=5.7075707942970615, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:43:55,117] [INFO] [timer.py:197:stop] 0/48, RunningAvgSamplesPerSec=6.377019018004726, CurrSamplesPerSec=5.686304945402372, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:44:06,451] [INFO] [timer.py:197:stop] 0/50, RunningAvgSamplesPerSec=6.37543476454316, CurrSamplesPerSec=5.709647715323768, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.3246, 'learning_rate': 4.973833272194737e-06, 'epoch': 0.11} |
|
[2022-12-16 12:44:17,771] [INFO] [timer.py:197:stop] 0/52, RunningAvgSamplesPerSec=6.373949262772854, CurrSamplesPerSec=5.691750812101744, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:44:29,101] [INFO] [timer.py:197:stop] 0/54, RunningAvgSamplesPerSec=6.372228449261265, CurrSamplesPerSec=5.69880753129992, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:44:40,455] [INFO] [timer.py:197:stop] 0/56, RunningAvgSamplesPerSec=6.370542883426897, CurrSamplesPerSec=5.680790624361286, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:44:51,786] [INFO] [timer.py:197:stop] 0/58, RunningAvgSamplesPerSec=6.369515579272822, CurrSamplesPerSec=5.699690125593197, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:03,101] [INFO] [logging.py:68:log_dist] [Rank 0] step=30, skipped=3, lr=[5.303370403744525e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:45:03,102] [INFO] [timer.py:197:stop] 0/60, RunningAvgSamplesPerSec=6.368480625171759, CurrSamplesPerSec=5.7089997596752, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:14,482] [INFO] [timer.py:197:stop] 0/62, RunningAvgSamplesPerSec=6.3666495482652, CurrSamplesPerSec=5.675317465009016, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:25,861] [INFO] [timer.py:197:stop] 0/64, RunningAvgSamplesPerSec=6.364903051268984, CurrSamplesPerSec=5.685087177683414, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:37,183] [INFO] [timer.py:197:stop] 0/66, RunningAvgSamplesPerSec=6.364047355448845, CurrSamplesPerSec=5.707811575579985, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:48,536] [INFO] [timer.py:197:stop] 0/68, RunningAvgSamplesPerSec=6.362912684415659, CurrSamplesPerSec=5.695592380490683, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:45:59,895] [INFO] [timer.py:197:stop] 0/70, RunningAvgSamplesPerSec=6.361712214957278, CurrSamplesPerSec=5.683929625983731, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:46:11,275] [INFO] [timer.py:197:stop] 0/72, RunningAvgSamplesPerSec=6.360228512038433, CurrSamplesPerSec=5.675749697315712, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:46:22,617] [INFO] [timer.py:197:stop] 0/74, RunningAvgSamplesPerSec=6.359199676451197, CurrSamplesPerSec=5.6725835196614005, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:46:33,974] [INFO] [timer.py:197:stop] 0/76, RunningAvgSamplesPerSec=6.358251401819239, CurrSamplesPerSec=5.690369308760629, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:46:45,348] [INFO] [timer.py:197:stop] 0/78, RunningAvgSamplesPerSec=6.357095959071015, CurrSamplesPerSec=5.688037839526133, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:46:56,669] [INFO] [logging.py:68:log_dist] [Rank 0] step=40, skipped=3, lr=[5.810371073215365e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:46:56,670] [INFO] [timer.py:197:stop] 0/80, RunningAvgSamplesPerSec=6.356839339793331, CurrSamplesPerSec=5.7071434090115964, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:47:08,047] [INFO] [timer.py:197:stop] 0/82, RunningAvgSamplesPerSec=6.355727674187336, CurrSamplesPerSec=5.6961031281183585, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:47:19,383] [INFO] [timer.py:197:stop] 0/84, RunningAvgSamplesPerSec=6.355341089950659, CurrSamplesPerSec=5.707567396319488, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:47:30,685] [INFO] [timer.py:197:stop] 0/86, RunningAvgSamplesPerSec=6.354985244848452, CurrSamplesPerSec=5.707153601441993, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:47:42,027] [INFO] [timer.py:197:stop] 0/88, RunningAvgSamplesPerSec=6.354542229262944, CurrSamplesPerSec=5.702634405066058, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:47:53,384] [INFO] [timer.py:197:stop] 0/90, RunningAvgSamplesPerSec=6.353934422195688, CurrSamplesPerSec=5.68178237423023, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:48:04,773] [INFO] [timer.py:197:stop] 0/92, RunningAvgSamplesPerSec=6.353584416009719, CurrSamplesPerSec=5.710814552007692, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:48:16,116] [INFO] [timer.py:197:stop] 0/94, RunningAvgSamplesPerSec=6.352760486107527, CurrSamplesPerSec=5.688323744419653, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:48:27,504] [INFO] [timer.py:197:stop] 0/96, RunningAvgSamplesPerSec=6.351592342068352, CurrSamplesPerSec=5.652738899857816, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:48:38,843] [INFO] [timer.py:197:stop] 0/98, RunningAvgSamplesPerSec=6.351100856548269, CurrSamplesPerSec=5.697337954217791, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:48:50,211] [INFO] [logging.py:68:log_dist] [Rank 0] step=50, skipped=3, lr=[6.195318418690893e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:48:50,213] [INFO] [timer.py:197:stop] 0/100, RunningAvgSamplesPerSec=6.350433200108205, CurrSamplesPerSec=5.6906366285964465, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1691, 'learning_rate': 6.195318418690893e-06, 'epoch': 0.21} |
|
[2022-12-16 12:49:01,599] [INFO] [timer.py:197:stop] 0/102, RunningAvgSamplesPerSec=6.3496171237170556, CurrSamplesPerSec=5.663726039473994, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:49:12,937] [INFO] [timer.py:197:stop] 0/104, RunningAvgSamplesPerSec=6.349453781242221, CurrSamplesPerSec=5.705724589763913, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:49:24,295] [INFO] [timer.py:197:stop] 0/106, RunningAvgSamplesPerSec=6.349035750526656, CurrSamplesPerSec=5.715350621958373, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:49:35,667] [INFO] [timer.py:197:stop] 0/108, RunningAvgSamplesPerSec=6.348487560326544, CurrSamplesPerSec=5.6962899981012045, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:49:46,955] [INFO] [timer.py:197:stop] 0/110, RunningAvgSamplesPerSec=6.348730187619528, CurrSamplesPerSec=5.728014403669337, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:49:58,281] [INFO] [timer.py:197:stop] 0/112, RunningAvgSamplesPerSec=6.348737425707652, CurrSamplesPerSec=5.726789458927852, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:50:09,648] [INFO] [timer.py:197:stop] 0/114, RunningAvgSamplesPerSec=6.348266706061173, CurrSamplesPerSec=5.67953604234834, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:50:21,004] [INFO] [timer.py:197:stop] 0/116, RunningAvgSamplesPerSec=6.347940860010868, CurrSamplesPerSec=5.693142887368156, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:50:32,344] [INFO] [timer.py:197:stop] 0/118, RunningAvgSamplesPerSec=6.347587204060109, CurrSamplesPerSec=5.698724779414815, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:50:43,692] [INFO] [logging.py:68:log_dist] [Rank 0] step=60, skipped=3, lr=[6.505722008216461e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:50:43,694] [INFO] [timer.py:197:stop] 0/120, RunningAvgSamplesPerSec=6.347324281790804, CurrSamplesPerSec=5.701369430780287, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:50:55,056] [INFO] [timer.py:197:stop] 0/122, RunningAvgSamplesPerSec=6.346958901073795, CurrSamplesPerSec=5.704603232008679, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:51:06,414] [INFO] [timer.py:197:stop] 0/124, RunningAvgSamplesPerSec=6.346601483736209, CurrSamplesPerSec=5.687951784405772, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:51:17,801] [INFO] [timer.py:197:stop] 0/126, RunningAvgSamplesPerSec=6.34594390644621, CurrSamplesPerSec=5.660766399429713, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:51:29,174] [INFO] [timer.py:197:stop] 0/128, RunningAvgSamplesPerSec=6.345436031798535, CurrSamplesPerSec=5.684730568773232, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:51:40,485] [INFO] [timer.py:197:stop] 0/130, RunningAvgSamplesPerSec=6.345409778459219, CurrSamplesPerSec=5.714689448717287, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:51:51,825] [INFO] [timer.py:197:stop] 0/132, RunningAvgSamplesPerSec=6.345255907930603, CurrSamplesPerSec=5.712015898245238, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:03,176] [INFO] [timer.py:197:stop] 0/134, RunningAvgSamplesPerSec=6.345007895933262, CurrSamplesPerSec=5.699063303049912, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:14,543] [INFO] [timer.py:197:stop] 0/136, RunningAvgSamplesPerSec=6.344485483804747, CurrSamplesPerSec=5.668854321704523, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:25,910] [INFO] [timer.py:197:stop] 0/138, RunningAvgSamplesPerSec=6.344204784261706, CurrSamplesPerSec=5.686490691190197, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:37,258] [INFO] [logging.py:68:log_dist] [Rank 0] step=70, skipped=3, lr=[6.765821034569313e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:52:37,259] [INFO] [timer.py:197:stop] 0/140, RunningAvgSamplesPerSec=6.343738385340108, CurrSamplesPerSec=5.668696780364775, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:48,644] [INFO] [timer.py:197:stop] 0/142, RunningAvgSamplesPerSec=6.343346662062031, CurrSamplesPerSec=5.707930760122382, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:52:59,988] [INFO] [timer.py:197:stop] 0/144, RunningAvgSamplesPerSec=6.343367542978673, CurrSamplesPerSec=5.698923436372281, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:53:11,361] [INFO] [timer.py:197:stop] 0/146, RunningAvgSamplesPerSec=6.343167017241652, CurrSamplesPerSec=5.693951984773787, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:53:22,734] [INFO] [timer.py:197:stop] 0/148, RunningAvgSamplesPerSec=6.342659205769162, CurrSamplesPerSec=5.658159989352935, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:53:34,026] [INFO] [timer.py:197:stop] 0/150, RunningAvgSamplesPerSec=6.342879909127556, CurrSamplesPerSec=5.734806308786819, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1644, 'learning_rate': 6.881634451095711e-06, 'epoch': 0.32} |
|
[2022-12-16 12:53:45,320] [INFO] [timer.py:197:stop] 0/152, RunningAvgSamplesPerSec=6.342917910651529, CurrSamplesPerSec=5.710984649224423, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:53:56,669] [INFO] [timer.py:197:stop] 0/154, RunningAvgSamplesPerSec=6.342733602913983, CurrSamplesPerSec=5.707075945747419, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:54:07,978] [INFO] [timer.py:197:stop] 0/156, RunningAvgSamplesPerSec=6.3428841523302095, CurrSamplesPerSec=5.728329034097408, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:54:19,322] [INFO] [timer.py:197:stop] 0/158, RunningAvgSamplesPerSec=6.34275224260025, CurrSamplesPerSec=5.702134598060043, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:54:30,661] [INFO] [logging.py:68:log_dist] [Rank 0] step=80, skipped=3, lr=[6.9896691039239e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:54:30,663] [INFO] [timer.py:197:stop] 0/160, RunningAvgSamplesPerSec=6.342683694089697, CurrSamplesPerSec=5.7029733933644255, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:54:42,087] [INFO] [timer.py:197:stop] 0/162, RunningAvgSamplesPerSec=6.342013189520437, CurrSamplesPerSec=5.697145453446462, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:54:53,432] [INFO] [timer.py:197:stop] 0/164, RunningAvgSamplesPerSec=6.341816271259256, CurrSamplesPerSec=5.701045404740299, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:55:04,839] [INFO] [timer.py:197:stop] 0/166, RunningAvgSamplesPerSec=6.341202369491997, CurrSamplesPerSec=5.623904352955541, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:55:16,160] [INFO] [timer.py:197:stop] 0/168, RunningAvgSamplesPerSec=6.3412941460944685, CurrSamplesPerSec=5.697974798834354, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:55:27,484] [INFO] [timer.py:197:stop] 0/170, RunningAvgSamplesPerSec=6.341284673747386, CurrSamplesPerSec=5.709030599807577, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:55:38,824] [INFO] [timer.py:197:stop] 0/172, RunningAvgSamplesPerSec=6.3411692034098985, CurrSamplesPerSec=5.697860383670918, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:55:50,137] [INFO] [timer.py:197:stop] 0/174, RunningAvgSamplesPerSec=6.341308142694129, CurrSamplesPerSec=5.705610833345307, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:01,451] [INFO] [timer.py:197:stop] 0/176, RunningAvgSamplesPerSec=6.341335680588037, CurrSamplesPerSec=5.69166705821747, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:12,805] [INFO] [timer.py:197:stop] 0/178, RunningAvgSamplesPerSec=6.3412695672311505, CurrSamplesPerSec=5.6963572066037775, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:24,116] [INFO] [logging.py:68:log_dist] [Rank 0] step=90, skipped=3, lr=[7.186146009413563e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:56:24,118] [INFO] [timer.py:197:stop] 0/180, RunningAvgSamplesPerSec=6.3413842936177165, CurrSamplesPerSec=5.70348134547156, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:35,476] [INFO] [timer.py:197:stop] 0/182, RunningAvgSamplesPerSec=6.341238572121128, CurrSamplesPerSec=5.688341102134533, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:46,831] [INFO] [timer.py:197:stop] 0/184, RunningAvgSamplesPerSec=6.341057889364869, CurrSamplesPerSec=5.673437384747106, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:56:58,196] [INFO] [timer.py:197:stop] 0/186, RunningAvgSamplesPerSec=6.340863335113536, CurrSamplesPerSec=5.687693634667262, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:57:09,533] [INFO] [timer.py:197:stop] 0/188, RunningAvgSamplesPerSec=6.340707296916432, CurrSamplesPerSec=5.692211623233167, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:57:20,857] [INFO] [timer.py:197:stop] 0/190, RunningAvgSamplesPerSec=6.3407842661084945, CurrSamplesPerSec=5.694887684662681, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:57:32,184] [INFO] [timer.py:197:stop] 0/192, RunningAvgSamplesPerSec=6.340805746497473, CurrSamplesPerSec=5.7107445720947325, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:57:43,477] [INFO] [timer.py:197:stop] 0/194, RunningAvgSamplesPerSec=6.3409575863837775, CurrSamplesPerSec=5.7252595029953826, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:57:54,814] [INFO] [timer.py:197:stop] 0/196, RunningAvgSamplesPerSec=6.340975998120396, CurrSamplesPerSec=5.690191511550644, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:58:06,164] [INFO] [timer.py:197:stop] 0/198, RunningAvgSamplesPerSec=6.340876962362114, CurrSamplesPerSec=5.713173732372909, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:58:17,522] [INFO] [logging.py:68:log_dist] [Rank 0] step=100, skipped=3, lr=[7.361221988663844e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 12:58:17,524] [INFO] [timer.py:197:stop] 0/200, RunningAvgSamplesPerSec=6.340767966334212, CurrSamplesPerSec=5.680717771813953, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1458, 'learning_rate': 7.361221988663844e-06, 'epoch': 0.42} |
|
[2022-12-16 12:58:28,862] [INFO] [timer.py:197:stop] 0/202, RunningAvgSamplesPerSec=6.340591964672333, CurrSamplesPerSec=5.677220163973008, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:58:40,200] [INFO] [timer.py:197:stop] 0/204, RunningAvgSamplesPerSec=6.34056087587368, CurrSamplesPerSec=5.678926379567298, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:58:51,575] [INFO] [timer.py:197:stop] 0/206, RunningAvgSamplesPerSec=6.340353070949561, CurrSamplesPerSec=5.706827947024672, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:02,889] [INFO] [timer.py:197:stop] 0/208, RunningAvgSamplesPerSec=6.340482329147856, CurrSamplesPerSec=5.71856619629364, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:14,195] [INFO] [timer.py:197:stop] 0/210, RunningAvgSamplesPerSec=6.340559967021615, CurrSamplesPerSec=5.71644504525915, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:25,530] [INFO] [timer.py:197:stop] 0/212, RunningAvgSamplesPerSec=6.340570532053464, CurrSamplesPerSec=5.703255954791647, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:36,887] [INFO] [timer.py:197:stop] 0/214, RunningAvgSamplesPerSec=6.340519276026067, CurrSamplesPerSec=5.718165909202066, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:48,209] [INFO] [timer.py:197:stop] 0/216, RunningAvgSamplesPerSec=6.340597670791518, CurrSamplesPerSec=5.722122268841553, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 12:59:59,540] [INFO] [timer.py:197:stop] 0/218, RunningAvgSamplesPerSec=6.340674298114713, CurrSamplesPerSec=5.7178209714922055, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:00:10,862] [INFO] [logging.py:68:log_dist] [Rank 0] step=110, skipped=3, lr=[7.5191046007362515e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:00:10,863] [INFO] [timer.py:197:stop] 0/220, RunningAvgSamplesPerSec=6.340654507160751, CurrSamplesPerSec=5.698601865846052, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:00:22,219] [INFO] [timer.py:197:stop] 0/222, RunningAvgSamplesPerSec=6.340582477276827, CurrSamplesPerSec=5.696138180415054, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:00:33,530] [INFO] [timer.py:197:stop] 0/224, RunningAvgSamplesPerSec=6.3406193213495925, CurrSamplesPerSec=5.7056717130819, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:00:44,861] [INFO] [timer.py:197:stop] 0/226, RunningAvgSamplesPerSec=6.340673168930113, CurrSamplesPerSec=5.729910043329389, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:00:56,144] [INFO] [timer.py:197:stop] 0/228, RunningAvgSamplesPerSec=6.3409484317239055, CurrSamplesPerSec=5.753290344527706, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:01:07,484] [INFO] [timer.py:197:stop] 0/230, RunningAvgSamplesPerSec=6.34090809825985, CurrSamplesPerSec=5.719477344614187, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:01:18,773] [INFO] [timer.py:197:stop] 0/232, RunningAvgSamplesPerSec=6.341049096424349, CurrSamplesPerSec=5.733209617158469, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:01:30,106] [INFO] [timer.py:197:stop] 0/234, RunningAvgSamplesPerSec=6.341090554053744, CurrSamplesPerSec=5.708298782910727, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:01:41,423] [INFO] [timer.py:197:stop] 0/236, RunningAvgSamplesPerSec=6.3411969087420434, CurrSamplesPerSec=5.719391797881744, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:01:52,772] [INFO] [timer.py:197:stop] 0/238, RunningAvgSamplesPerSec=6.341103710545856, CurrSamplesPerSec=5.686260136951288, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:02:04,087] [INFO] [logging.py:68:log_dist] [Rank 0] step=120, skipped=3, lr=[7.662870867121632e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:02:04,088] [INFO] [timer.py:197:stop] 0/240, RunningAvgSamplesPerSec=6.341185991377369, CurrSamplesPerSec=5.713273198646448, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:02:15,422] [INFO] [timer.py:197:stop] 0/242, RunningAvgSamplesPerSec=6.341224830608527, CurrSamplesPerSec=5.715415603808621, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:02:26,729] [INFO] [timer.py:197:stop] 0/244, RunningAvgSamplesPerSec=6.34135744332114, CurrSamplesPerSec=5.711197284995015, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:02:38,053] [INFO] [timer.py:197:stop] 0/246, RunningAvgSamplesPerSec=6.34141121362421, CurrSamplesPerSec=5.721091269531728, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:02:49,398] [INFO] [timer.py:197:stop] 0/248, RunningAvgSamplesPerSec=6.3413486959959195, CurrSamplesPerSec=5.699164456585958, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:03:00,728] [INFO] [timer.py:197:stop] 0/250, RunningAvgSamplesPerSec=6.341232802134687, CurrSamplesPerSec=5.691354752785703, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1389, 'learning_rate': 7.730207550743121e-06, 'epoch': 0.53} |
|
[2022-12-16 13:03:12,074] [INFO] [timer.py:197:stop] 0/252, RunningAvgSamplesPerSec=6.341191703621708, CurrSamplesPerSec=5.709133078861001, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:03:23,470] [INFO] [timer.py:197:stop] 0/254, RunningAvgSamplesPerSec=6.340809226261937, CurrSamplesPerSec=5.66181732648358, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:03:34,837] [INFO] [timer.py:197:stop] 0/256, RunningAvgSamplesPerSec=6.340656210309537, CurrSamplesPerSec=5.695450750660555, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:03:46,216] [INFO] [timer.py:197:stop] 0/258, RunningAvgSamplesPerSec=6.340446151839456, CurrSamplesPerSec=5.6939783144927985, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:03:57,594] [INFO] [logging.py:68:log_dist] [Rank 0] step=130, skipped=3, lr=[7.794839207460995e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:03:57,596] [INFO] [timer.py:197:stop] 0/260, RunningAvgSamplesPerSec=6.3402530863003035, CurrSamplesPerSec=5.676343794160837, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:04:08,987] [INFO] [timer.py:197:stop] 0/262, RunningAvgSamplesPerSec=6.339991907491181, CurrSamplesPerSec=5.662990020741442, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:04:20,372] [INFO] [timer.py:197:stop] 0/264, RunningAvgSamplesPerSec=6.33977236935012, CurrSamplesPerSec=5.682990304456265, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:04:31,812] [INFO] [timer.py:197:stop] 0/266, RunningAvgSamplesPerSec=6.33927565959383, CurrSamplesPerSec=5.644096081786364, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:04:43,213] [INFO] [timer.py:197:stop] 0/268, RunningAvgSamplesPerSec=6.338978242661073, CurrSamplesPerSec=5.671891216846216, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:04:54,555] [INFO] [timer.py:197:stop] 0/270, RunningAvgSamplesPerSec=6.338812609327249, CurrSamplesPerSec=5.693351540036939, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:05:06,009] [INFO] [timer.py:197:stop] 0/272, RunningAvgSamplesPerSec=6.338416777493823, CurrSamplesPerSec=5.65095011838482, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:05:17,395] [INFO] [timer.py:197:stop] 0/274, RunningAvgSamplesPerSec=6.338115050856076, CurrSamplesPerSec=5.681647924220384, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:05:28,788] [INFO] [timer.py:197:stop] 0/276, RunningAvgSamplesPerSec=6.337804017797665, CurrSamplesPerSec=5.6637430083955795, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:05:40,120] [INFO] [timer.py:197:stop] 0/278, RunningAvgSamplesPerSec=6.337846770383127, CurrSamplesPerSec=5.722553608490977, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:05:51,506] [INFO] [logging.py:68:log_dist] [Rank 0] step=140, skipped=3, lr=[7.916799978227501e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:05:51,508] [INFO] [timer.py:197:stop] 0/280, RunningAvgSamplesPerSec=6.337560045613895, CurrSamplesPerSec=5.677672140398493, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:02,886] [INFO] [timer.py:197:stop] 0/282, RunningAvgSamplesPerSec=6.3373140118868525, CurrSamplesPerSec=5.672994234809722, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:14,273] [INFO] [timer.py:197:stop] 0/284, RunningAvgSamplesPerSec=6.337108122417633, CurrSamplesPerSec=5.683808312585445, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:25,659] [INFO] [timer.py:197:stop] 0/286, RunningAvgSamplesPerSec=6.336910867350083, CurrSamplesPerSec=5.670826723524054, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:37,005] [INFO] [timer.py:197:stop] 0/288, RunningAvgSamplesPerSec=6.336887675368015, CurrSamplesPerSec=5.709099323505792, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:48,367] [INFO] [timer.py:197:stop] 0/290, RunningAvgSamplesPerSec=6.33672835037139, CurrSamplesPerSec=5.678792064686691, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:06:59,737] [INFO] [timer.py:197:stop] 0/292, RunningAvgSamplesPerSec=6.336600926892866, CurrSamplesPerSec=5.689706184143832, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:07:11,154] [INFO] [timer.py:197:stop] 0/294, RunningAvgSamplesPerSec=6.336275402994384, CurrSamplesPerSec=5.6646066456642465, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:07:22,536] [INFO] [timer.py:197:stop] 0/296, RunningAvgSamplesPerSec=6.336103758210258, CurrSamplesPerSec=5.683237438898976, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:07:33,911] [INFO] [timer.py:197:stop] 0/298, RunningAvgSamplesPerSec=6.335964380155541, CurrSamplesPerSec=5.690570278821041, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:07:45,241] [INFO] [logging.py:68:log_dist] [Rank 0] step=150, skipped=3, lr=[8.03016458599496e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:07:45,243] [INFO] [timer.py:197:stop] 0/300, RunningAvgSamplesPerSec=6.335939475961018, CurrSamplesPerSec=5.704075201910128, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1376, 'learning_rate': 8.03016458599496e-06, 'epoch': 0.64} |
|
[2022-12-16 13:07:56,649] [INFO] [timer.py:197:stop] 0/302, RunningAvgSamplesPerSec=6.335681670539102, CurrSamplesPerSec=5.672674624718876, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:08:08,021] [INFO] [timer.py:197:stop] 0/304, RunningAvgSamplesPerSec=6.335556580135721, CurrSamplesPerSec=5.6922780112891544, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:08:19,398] [INFO] [timer.py:197:stop] 0/306, RunningAvgSamplesPerSec=6.335409090735576, CurrSamplesPerSec=5.698753088999794, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:08:30,775] [INFO] [timer.py:197:stop] 0/308, RunningAvgSamplesPerSec=6.335265510045217, CurrSamplesPerSec=5.695196269706352, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:08:42,145] [INFO] [timer.py:197:stop] 0/310, RunningAvgSamplesPerSec=6.3351501550714735, CurrSamplesPerSec=5.7040398094490365, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:08:53,530] [INFO] [timer.py:197:stop] 0/312, RunningAvgSamplesPerSec=6.33497950697826, CurrSamplesPerSec=5.686301090884195, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:09:04,886] [INFO] [timer.py:197:stop] 0/314, RunningAvgSamplesPerSec=6.334928011074155, CurrSamplesPerSec=5.684328504353401, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:09:16,247] [INFO] [timer.py:197:stop] 0/316, RunningAvgSamplesPerSec=6.334859913066805, CurrSamplesPerSec=5.694762278685135, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:09:27,593] [INFO] [timer.py:197:stop] 0/318, RunningAvgSamplesPerSec=6.334788013895496, CurrSamplesPerSec=5.692994618031218, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:09:38,950] [INFO] [logging.py:68:log_dist] [Rank 0] step=160, skipped=3, lr=[8.136065420813943e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:09:38,952] [INFO] [timer.py:197:stop] 0/320, RunningAvgSamplesPerSec=6.334726194202973, CurrSamplesPerSec=5.69166705821747, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:09:50,346] [INFO] [timer.py:197:stop] 0/322, RunningAvgSamplesPerSec=6.334528318138216, CurrSamplesPerSec=5.664160810534791, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:01,694] [INFO] [timer.py:197:stop] 0/324, RunningAvgSamplesPerSec=6.334444913892337, CurrSamplesPerSec=5.695642170124778, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:13,030] [INFO] [timer.py:197:stop] 0/326, RunningAvgSamplesPerSec=6.334481645980198, CurrSamplesPerSec=5.728451521955659, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:24,375] [INFO] [timer.py:197:stop] 0/328, RunningAvgSamplesPerSec=6.334359224662135, CurrSamplesPerSec=5.693915993166506, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:35,730] [INFO] [timer.py:197:stop] 0/330, RunningAvgSamplesPerSec=6.334316250489746, CurrSamplesPerSec=5.70256511006069, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:47,124] [INFO] [timer.py:197:stop] 0/332, RunningAvgSamplesPerSec=6.334127316131262, CurrSamplesPerSec=5.681846113935855, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:10:58,487] [INFO] [timer.py:197:stop] 0/334, RunningAvgSamplesPerSec=6.334067896845867, CurrSamplesPerSec=5.688566279869823, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:11:09,868] [INFO] [timer.py:197:stop] 0/336, RunningAvgSamplesPerSec=6.33393047819036, CurrSamplesPerSec=5.69602673978007, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:11:21,244] [INFO] [timer.py:197:stop] 0/338, RunningAvgSamplesPerSec=6.33380854480409, CurrSamplesPerSec=5.695747311289742, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:11:32,639] [INFO] [logging.py:68:log_dist] [Rank 0] step=170, skipped=3, lr=[8.235424875329062e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:11:32,641] [INFO] [timer.py:197:stop] 0/340, RunningAvgSamplesPerSec=6.333611534013795, CurrSamplesPerSec=5.6619486899466605, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:11:44,035] [INFO] [timer.py:197:stop] 0/342, RunningAvgSamplesPerSec=6.333436692059929, CurrSamplesPerSec=5.672640100352739, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:11:55,425] [INFO] [timer.py:197:stop] 0/344, RunningAvgSamplesPerSec=6.333279569434337, CurrSamplesPerSec=5.672706032667005, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:12:06,796] [INFO] [timer.py:197:stop] 0/346, RunningAvgSamplesPerSec=6.333207104012073, CurrSamplesPerSec=5.714714753968254, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:12:18,158] [INFO] [timer.py:197:stop] 0/348, RunningAvgSamplesPerSec=6.333079636533842, CurrSamplesPerSec=5.695123047206334, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:12:29,545] [INFO] [timer.py:197:stop] 0/350, RunningAvgSamplesPerSec=6.332933225418649, CurrSamplesPerSec=5.690190546602886, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1374, 'learning_rate': 8.282894746203441e-06, 'epoch': 0.74} |
|
[2022-12-16 13:12:40,934] [INFO] [timer.py:197:stop] 0/352, RunningAvgSamplesPerSec=6.3327789258231455, CurrSamplesPerSec=5.684669172023788, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:12:52,331] [INFO] [timer.py:197:stop] 0/354, RunningAvgSamplesPerSec=6.332533138818452, CurrSamplesPerSec=5.640574665860056, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:13:03,731] [INFO] [timer.py:197:stop] 0/356, RunningAvgSamplesPerSec=6.332345649627949, CurrSamplesPerSec=5.663396002221176, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:13:15,103] [INFO] [timer.py:197:stop] 0/358, RunningAvgSamplesPerSec=6.3322621358120035, CurrSamplesPerSec=5.684914285399642, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:13:26,447] [INFO] [logging.py:68:log_dist] [Rank 0] step=180, skipped=3, lr=[8.329004259959669e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:13:26,449] [INFO] [timer.py:197:stop] 0/360, RunningAvgSamplesPerSec=6.332159145240631, CurrSamplesPerSec=5.67905349198733, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:13:37,820] [INFO] [timer.py:197:stop] 0/362, RunningAvgSamplesPerSec=6.332024007287478, CurrSamplesPerSec=5.678510240339759, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:13:49,169] [INFO] [timer.py:197:stop] 0/364, RunningAvgSamplesPerSec=6.331968726588887, CurrSamplesPerSec=5.682525210969754, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:00,560] [INFO] [timer.py:197:stop] 0/366, RunningAvgSamplesPerSec=6.331832648381136, CurrSamplesPerSec=5.684769092901, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:11,953] [INFO] [timer.py:197:stop] 0/368, RunningAvgSamplesPerSec=6.331687391665875, CurrSamplesPerSec=5.67988911544788, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:23,301] [INFO] [timer.py:197:stop] 0/370, RunningAvgSamplesPerSec=6.3316890358546, CurrSamplesPerSec=5.702466985398981, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:34,670] [INFO] [timer.py:197:stop] 0/372, RunningAvgSamplesPerSec=6.331570795387366, CurrSamplesPerSec=5.695797345345154, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:46,028] [INFO] [timer.py:197:stop] 0/374, RunningAvgSamplesPerSec=6.331543357774669, CurrSamplesPerSec=5.711262415461503, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:14:57,328] [INFO] [timer.py:197:stop] 0/376, RunningAvgSamplesPerSec=6.331652133552371, CurrSamplesPerSec=5.7202161751546035, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:15:08,672] [INFO] [timer.py:197:stop] 0/378, RunningAvgSamplesPerSec=6.33161517038932, CurrSamplesPerSec=5.693738698870023, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:15:20,018] [INFO] [logging.py:68:log_dist] [Rank 0] step=190, skipped=3, lr=[8.417439256037237e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:15:20,019] [INFO] [timer.py:197:stop] 0/380, RunningAvgSamplesPerSec=6.331634947341557, CurrSamplesPerSec=5.706364766431756, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:15:31,375] [INFO] [timer.py:197:stop] 0/382, RunningAvgSamplesPerSec=6.331554282858503, CurrSamplesPerSec=5.694370148810471, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:15:42,826] [INFO] [timer.py:197:stop] 0/384, RunningAvgSamplesPerSec=6.331223208258476, CurrSamplesPerSec=5.598517748356675, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:15:54,187] [INFO] [timer.py:197:stop] 0/386, RunningAvgSamplesPerSec=6.33117774424811, CurrSamplesPerSec=5.698536781871162, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:16:05,614] [INFO] [timer.py:197:stop] 0/388, RunningAvgSamplesPerSec=6.331056018601255, CurrSamplesPerSec=5.66092851427846, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:16:16,986] [INFO] [timer.py:197:stop] 0/390, RunningAvgSamplesPerSec=6.3309740768420495, CurrSamplesPerSec=5.668828223825785, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:16:28,308] [INFO] [timer.py:197:stop] 0/392, RunningAvgSamplesPerSec=6.331003137668457, CurrSamplesPerSec=5.705271046407015, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:16:39,640] [INFO] [timer.py:197:stop] 0/394, RunningAvgSamplesPerSec=6.330996505918037, CurrSamplesPerSec=5.6957204817960045, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:16:50,988] [INFO] [timer.py:197:stop] 0/396, RunningAvgSamplesPerSec=6.33099225627066, CurrSamplesPerSec=5.69033022619594, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:17:02,323] [INFO] [timer.py:197:stop] 0/398, RunningAvgSamplesPerSec=6.331037308014052, CurrSamplesPerSec=5.711416741713082, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:17:13,685] [INFO] [logging.py:68:log_dist] [Rank 0] step=200, skipped=3, lr=[8.501266121799902e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:17:13,687] [INFO] [timer.py:197:stop] 0/400, RunningAvgSamplesPerSec=6.330927735043576, CurrSamplesPerSec=5.676714717671288, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1287, 'learning_rate': 8.501266121799902e-06, 'epoch': 0.85} |
|
[2022-12-16 13:17:25,016] [INFO] [timer.py:197:stop] 0/402, RunningAvgSamplesPerSec=6.330933021275755, CurrSamplesPerSec=5.702418045471839, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:17:36,361] [INFO] [timer.py:197:stop] 0/404, RunningAvgSamplesPerSec=6.330935328095778, CurrSamplesPerSec=5.709033271015826, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:17:47,092] [INFO] [stage_1_and_2.py:1765:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0 |
|
[2022-12-16 13:17:47,094] [INFO] [timer.py:197:stop] 0/406, RunningAvgSamplesPerSec=6.332789283396464, CurrSamplesPerSec=6.394784222406185, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:17:58,451] [INFO] [timer.py:197:stop] 0/408, RunningAvgSamplesPerSec=6.3327465412974275, CurrSamplesPerSec=5.706926464484751, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:18:09,815] [INFO] [timer.py:197:stop] 0/410, RunningAvgSamplesPerSec=6.332696453161811, CurrSamplesPerSec=5.677898635990999, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:18:21,204] [INFO] [timer.py:197:stop] 0/412, RunningAvgSamplesPerSec=6.3325963497900535, CurrSamplesPerSec=5.681225614591862, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:18:32,608] [INFO] [timer.py:197:stop] 0/414, RunningAvgSamplesPerSec=6.332589791376257, CurrSamplesPerSec=5.693203259831853, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:18:43,964] [INFO] [timer.py:197:stop] 0/416, RunningAvgSamplesPerSec=6.332529284688327, CurrSamplesPerSec=5.675976039549646, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:18:55,323] [INFO] [timer.py:197:stop] 0/418, RunningAvgSamplesPerSec=6.332541567747686, CurrSamplesPerSec=5.696589063934239, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:19:06,712] [INFO] [logging.py:68:log_dist] [Rank 0] step=210, skipped=4, lr=[8.573149077803088e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:19:06,713] [INFO] [timer.py:197:stop] 0/420, RunningAvgSamplesPerSec=6.332525376024386, CurrSamplesPerSec=5.685732365818978, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:19:18,077] [INFO] [timer.py:197:stop] 0/422, RunningAvgSamplesPerSec=6.332480645100248, CurrSamplesPerSec=5.688045312222482, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:19:29,426] [INFO] [timer.py:197:stop] 0/424, RunningAvgSamplesPerSec=6.332436469764684, CurrSamplesPerSec=5.694331252857661, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:19:40,762] [INFO] [timer.py:197:stop] 0/426, RunningAvgSamplesPerSec=6.332425390489938, CurrSamplesPerSec=5.710604131472058, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:19:52,108] [INFO] [timer.py:197:stop] 0/428, RunningAvgSamplesPerSec=6.33234514899996, CurrSamplesPerSec=5.6711051498664125, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:20:03,460] [INFO] [timer.py:197:stop] 0/430, RunningAvgSamplesPerSec=6.332341452878609, CurrSamplesPerSec=5.702187166947249, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:20:14,798] [INFO] [timer.py:197:stop] 0/432, RunningAvgSamplesPerSec=6.3323660481354285, CurrSamplesPerSec=5.70455498277639, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:20:26,129] [INFO] [timer.py:197:stop] 0/434, RunningAvgSamplesPerSec=6.332410466165831, CurrSamplesPerSec=5.709264704274692, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:20:37,449] [INFO] [timer.py:197:stop] 0/436, RunningAvgSamplesPerSec=6.332434604656064, CurrSamplesPerSec=5.724072597754097, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:20:48,804] [INFO] [timer.py:197:stop] 0/438, RunningAvgSamplesPerSec=6.332414739569677, CurrSamplesPerSec=5.7062907712597895, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:00,111] [INFO] [logging.py:68:log_dist] [Rank 0] step=220, skipped=4, lr=[8.64942458567722e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:21:00,113] [INFO] [timer.py:197:stop] 0/440, RunningAvgSamplesPerSec=6.332536560073241, CurrSamplesPerSec=5.718785488696217, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:11,432] [INFO] [timer.py:197:stop] 0/442, RunningAvgSamplesPerSec=6.332569559767877, CurrSamplesPerSec=5.701313970796222, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:22,769] [INFO] [timer.py:197:stop] 0/444, RunningAvgSamplesPerSec=6.3326005766264215, CurrSamplesPerSec=5.708380841954919, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:34,090] [INFO] [timer.py:197:stop] 0/446, RunningAvgSamplesPerSec=6.3326758013028215, CurrSamplesPerSec=5.7286749963346875, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:45,409] [INFO] [timer.py:197:stop] 0/448, RunningAvgSamplesPerSec=6.33271448435985, CurrSamplesPerSec=5.709920007439783, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:21:56,742] [INFO] [timer.py:197:stop] 0/450, RunningAvgSamplesPerSec=6.3327607879641326, CurrSamplesPerSec=5.704823879601781, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.1225, 'learning_rate': 8.686247975778677e-06, 'epoch': 0.95} |
|
[2022-12-16 13:22:08,082] [INFO] [timer.py:197:stop] 0/452, RunningAvgSamplesPerSec=6.332790148344536, CurrSamplesPerSec=5.718563272508554, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:22:19,449] [INFO] [timer.py:197:stop] 0/454, RunningAvgSamplesPerSec=6.3327388875050685, CurrSamplesPerSec=5.680145595658538, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:22:30,809] [INFO] [timer.py:197:stop] 0/456, RunningAvgSamplesPerSec=6.332711167657343, CurrSamplesPerSec=5.712535674159144, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:22:42,108] [INFO] [timer.py:197:stop] 0/458, RunningAvgSamplesPerSec=6.33280234507031, CurrSamplesPerSec=5.7229347443492955, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:22:53,449] [INFO] [logging.py:68:log_dist] [Rank 0] step=230, skipped=4, lr=[8.722247506883805e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:22:53,450] [INFO] [timer.py:197:stop] 0/460, RunningAvgSamplesPerSec=6.3328301209678255, CurrSamplesPerSec=5.712547830948624, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:04,788] [INFO] [timer.py:197:stop] 0/462, RunningAvgSamplesPerSec=6.332812979171486, CurrSamplesPerSec=5.6831748711853285, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:16,134] [INFO] [timer.py:197:stop] 0/464, RunningAvgSamplesPerSec=6.332817304905055, CurrSamplesPerSec=5.701674358874692, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:26,839] [INFO] [stage_1_and_2.py:1765:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 |
|
[2022-12-16 13:23:26,841] [INFO] [timer.py:197:stop] 0/466, RunningAvgSamplesPerSec=6.334509821736817, CurrSamplesPerSec=6.410038609411664, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:38,196] [INFO] [timer.py:197:stop] 0/468, RunningAvgSamplesPerSec=6.33448448174401, CurrSamplesPerSec=5.692148857768002, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:49,581] [INFO] [timer.py:197:stop] 0/470, RunningAvgSamplesPerSec=6.334334324008184, CurrSamplesPerSec=5.6524056189787935, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:23:58,075] [INFO] [timer.py:197:stop] 0/472, RunningAvgSamplesPerSec=6.341034963749929, CurrSamplesPerSec=10.244828838544047, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:24:09,412] [INFO] [timer.py:197:stop] 0/474, RunningAvgSamplesPerSec=6.341031354651321, CurrSamplesPerSec=5.709517043762645, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:24:20,810] [INFO] [timer.py:197:stop] 0/476, RunningAvgSamplesPerSec=6.340830579777299, CurrSamplesPerSec=5.65173441535856, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:24:32,186] [INFO] [timer.py:197:stop] 0/478, RunningAvgSamplesPerSec=6.34072547922669, CurrSamplesPerSec=5.666355761890989, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:24:43,527] [INFO] [logging.py:68:log_dist] [Rank 0] step=240, skipped=5, lr=[8.785084156039184e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:24:43,529] [INFO] [timer.py:197:stop] 0/480, RunningAvgSamplesPerSec=6.340705507213309, CurrSamplesPerSec=5.708322574935123, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:24:54,875] [INFO] [timer.py:197:stop] 0/482, RunningAvgSamplesPerSec=6.340681063176589, CurrSamplesPerSec=5.698540411049955, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:25:06,236] [INFO] [timer.py:197:stop] 0/484, RunningAvgSamplesPerSec=6.340634152728032, CurrSamplesPerSec=5.68413736313393, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:25:17,613] [INFO] [timer.py:197:stop] 0/486, RunningAvgSamplesPerSec=6.340556261832466, CurrSamplesPerSec=5.685238647671385, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:25:28,931] [INFO] [timer.py:197:stop] 0/488, RunningAvgSamplesPerSec=6.340556874315241, CurrSamplesPerSec=5.725152048399553, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:25:40,233] [INFO] [timer.py:197:stop] 0/490, RunningAvgSamplesPerSec=6.3406058880671585, CurrSamplesPerSec=5.7148970070583704, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:25:51,607] [INFO] [timer.py:197:stop] 0/492, RunningAvgSamplesPerSec=6.340504282447385, CurrSamplesPerSec=5.659161271286198, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:26:02,916] [INFO] [timer.py:197:stop] 0/494, RunningAvgSamplesPerSec=6.340538253413768, CurrSamplesPerSec=5.714366826410801, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:26:14,293] [INFO] [timer.py:197:stop] 0/496, RunningAvgSamplesPerSec=6.34043777677742, CurrSamplesPerSec=5.673726380737918, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:26:25,621] [INFO] [timer.py:197:stop] 0/498, RunningAvgSamplesPerSec=6.340464247689892, CurrSamplesPerSec=5.718628814741922, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:26:36,937] [INFO] [logging.py:68:log_dist] [Rank 0] step=250, skipped=5, lr=[8.852140188761744e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:26:36,939] [INFO] [timer.py:197:stop] 0/500, RunningAvgSamplesPerSec=6.340507097510456, CurrSamplesPerSec=5.714383370258332, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0911, 'learning_rate': 8.852140188761744e-06, 'epoch': 1.06} |
|
[2022-12-16 13:26:48,258] [INFO] [timer.py:197:stop] 0/502, RunningAvgSamplesPerSec=6.340515051225799, CurrSamplesPerSec=5.708866203892745, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:26:59,598] [INFO] [timer.py:197:stop] 0/504, RunningAvgSamplesPerSec=6.340501513927871, CurrSamplesPerSec=5.696083789104848, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:27:10,938] [INFO] [timer.py:197:stop] 0/506, RunningAvgSamplesPerSec=6.340491594057469, CurrSamplesPerSec=5.696348986773738, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:27:22,273] [INFO] [timer.py:197:stop] 0/508, RunningAvgSamplesPerSec=6.340492785987803, CurrSamplesPerSec=5.7040514452781945, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:27:33,596] [INFO] [timer.py:197:stop] 0/510, RunningAvgSamplesPerSec=6.34044975201933, CurrSamplesPerSec=5.6719408326517975, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:27:44,949] [INFO] [timer.py:197:stop] 0/512, RunningAvgSamplesPerSec=6.340408872057434, CurrSamplesPerSec=5.685037331614662, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:27:56,297] [INFO] [timer.py:197:stop] 0/514, RunningAvgSamplesPerSec=6.340377687654657, CurrSamplesPerSec=5.704021628561016, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:28:07,656] [INFO] [timer.py:197:stop] 0/516, RunningAvgSamplesPerSec=6.340319751692764, CurrSamplesPerSec=5.679723028624139, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:28:18,978] [INFO] [timer.py:197:stop] 0/518, RunningAvgSamplesPerSec=6.340350792432082, CurrSamplesPerSec=5.7098451913181485, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:28:30,309] [INFO] [logging.py:68:log_dist] [Rank 0] step=260, skipped=5, lr=[8.916513249749862e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:28:30,311] [INFO] [timer.py:197:stop] 0/520, RunningAvgSamplesPerSec=6.340314121727346, CurrSamplesPerSec=5.703653187538543, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:28:41,680] [INFO] [timer.py:197:stop] 0/522, RunningAvgSamplesPerSec=6.340257665480462, CurrSamplesPerSec=5.697301194271666, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:28:53,006] [INFO] [timer.py:197:stop] 0/524, RunningAvgSamplesPerSec=6.340306276076679, CurrSamplesPerSec=5.723158032534116, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:29:04,397] [INFO] [timer.py:197:stop] 0/526, RunningAvgSamplesPerSec=6.340176406507254, CurrSamplesPerSec=5.6733251519178785, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:29:15,729] [INFO] [timer.py:197:stop] 0/528, RunningAvgSamplesPerSec=6.340154943695952, CurrSamplesPerSec=5.701712386349338, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:29:27,011] [INFO] [timer.py:197:stop] 0/530, RunningAvgSamplesPerSec=6.3402431766411205, CurrSamplesPerSec=5.734707806521304, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:29:38,377] [INFO] [timer.py:197:stop] 0/532, RunningAvgSamplesPerSec=6.3401683159730045, CurrSamplesPerSec=5.680019395846795, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:29:49,771] [INFO] [timer.py:197:stop] 0/534, RunningAvgSamplesPerSec=6.34005297999423, CurrSamplesPerSec=5.670300615513904, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:01,082] [INFO] [timer.py:197:stop] 0/536, RunningAvgSamplesPerSec=6.340096536081388, CurrSamplesPerSec=5.722985013174985, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:12,416] [INFO] [timer.py:197:stop] 0/538, RunningAvgSamplesPerSec=6.340091748500424, CurrSamplesPerSec=5.710884290641021, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:23,758] [INFO] [logging.py:68:log_dist] [Rank 0] step=270, skipped=5, lr=[8.978409800937961e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:30:23,759] [INFO] [timer.py:197:stop] 0/540, RunningAvgSamplesPerSec=6.3400577102176525, CurrSamplesPerSec=5.694194275689367, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:35,096] [INFO] [timer.py:197:stop] 0/542, RunningAvgSamplesPerSec=6.34004129781036, CurrSamplesPerSec=5.695970658506859, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:46,447] [INFO] [timer.py:197:stop] 0/544, RunningAvgSamplesPerSec=6.340026029384895, CurrSamplesPerSec=5.702012506319923, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:30:57,794] [INFO] [timer.py:197:stop] 0/546, RunningAvgSamplesPerSec=6.3400386927289, CurrSamplesPerSec=5.706275244647619, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:31:09,140] [INFO] [timer.py:197:stop] 0/548, RunningAvgSamplesPerSec=6.34005752347985, CurrSamplesPerSec=5.710305535873723, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:31:20,489] [INFO] [timer.py:197:stop] 0/550, RunningAvgSamplesPerSec=6.340052862396717, CurrSamplesPerSec=5.700895028486481, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0701, 'learning_rate': 9.00848753507038e-06, 'epoch': 1.17} |
|
[2022-12-16 13:31:31,787] [INFO] [timer.py:197:stop] 0/552, RunningAvgSamplesPerSec=6.340167350310263, CurrSamplesPerSec=5.770476218524265, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:31:43,153] [INFO] [timer.py:197:stop] 0/554, RunningAvgSamplesPerSec=6.340122552433611, CurrSamplesPerSec=5.6745541993997355, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:31:54,491] [INFO] [timer.py:197:stop] 0/556, RunningAvgSamplesPerSec=6.340139825646102, CurrSamplesPerSec=5.708953135855733, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:32:05,850] [INFO] [timer.py:197:stop] 0/558, RunningAvgSamplesPerSec=6.340071243133932, CurrSamplesPerSec=5.691814534287232, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:32:17,195] [INFO] [logging.py:68:log_dist] [Rank 0] step=280, skipped=5, lr=[9.038013352913754e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:32:17,197] [INFO] [timer.py:197:stop] 0/560, RunningAvgSamplesPerSec=6.340015870411556, CurrSamplesPerSec=5.687219336567981, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:32:28,543] [INFO] [timer.py:197:stop] 0/562, RunningAvgSamplesPerSec=6.340002215824556, CurrSamplesPerSec=5.7009136737425115, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:32:39,909] [INFO] [timer.py:197:stop] 0/564, RunningAvgSamplesPerSec=6.339945455396239, CurrSamplesPerSec=5.670917772159438, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:32:51,253] [INFO] [timer.py:197:stop] 0/566, RunningAvgSamplesPerSec=6.339925029302221, CurrSamplesPerSec=5.697719124776133, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:02,598] [INFO] [timer.py:197:stop] 0/568, RunningAvgSamplesPerSec=6.339901880491667, CurrSamplesPerSec=5.690123242304234, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:13,971] [INFO] [timer.py:197:stop] 0/570, RunningAvgSamplesPerSec=6.339801948583433, CurrSamplesPerSec=5.675292747439151, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:25,288] [INFO] [timer.py:197:stop] 0/572, RunningAvgSamplesPerSec=6.339862092630734, CurrSamplesPerSec=5.711445663623603, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:36,663] [INFO] [timer.py:197:stop] 0/574, RunningAvgSamplesPerSec=6.339783864658929, CurrSamplesPerSec=5.6676619658311305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:48,016] [INFO] [timer.py:197:stop] 0/576, RunningAvgSamplesPerSec=6.339745950695436, CurrSamplesPerSec=5.689556646658774, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:33:59,343] [INFO] [timer.py:197:stop] 0/578, RunningAvgSamplesPerSec=6.339769267297791, CurrSamplesPerSec=5.712568254471431, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:34:10,697] [INFO] [logging.py:68:log_dist] [Rank 0] step=290, skipped=5, lr=[9.095487745564754e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:34:10,699] [INFO] [timer.py:197:stop] 0/580, RunningAvgSamplesPerSec=6.339728316436596, CurrSamplesPerSec=5.70156197490803, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:34:22,038] [INFO] [timer.py:197:stop] 0/582, RunningAvgSamplesPerSec=6.339692731446194, CurrSamplesPerSec=5.697978911092788, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:34:33,374] [INFO] [timer.py:197:stop] 0/584, RunningAvgSamplesPerSec=6.339692650355827, CurrSamplesPerSec=5.7080191201038675, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:34:44,700] [INFO] [timer.py:197:stop] 0/586, RunningAvgSamplesPerSec=6.339714029901691, CurrSamplesPerSec=5.718308183769478, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:34:56,048] [INFO] [timer.py:197:stop] 0/588, RunningAvgSamplesPerSec=6.339682913023013, CurrSamplesPerSec=5.709339262321484, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:35:07,358] [INFO] [timer.py:197:stop] 0/590, RunningAvgSamplesPerSec=6.339736119920085, CurrSamplesPerSec=5.729573470814689, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:35:18,701] [INFO] [timer.py:197:stop] 0/592, RunningAvgSamplesPerSec=6.339704018750019, CurrSamplesPerSec=5.694426440167608, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:35:30,045] [INFO] [timer.py:197:stop] 0/594, RunningAvgSamplesPerSec=6.3396871452533015, CurrSamplesPerSec=5.697992457397849, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:35:41,402] [INFO] [timer.py:197:stop] 0/596, RunningAvgSamplesPerSec=6.339643448245747, CurrSamplesPerSec=5.680999815328403, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:35:52,674] [INFO] [timer.py:197:stop] 0/598, RunningAvgSamplesPerSec=6.339779092319611, CurrSamplesPerSec=5.7476326050363165, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:36:03,998] [INFO] [logging.py:68:log_dist] [Rank 0] step=300, skipped=5, lr=[9.150979862726452e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:36:04,000] [INFO] [timer.py:197:stop] 0/600, RunningAvgSamplesPerSec=6.339731948606691, CurrSamplesPerSec=5.685480679268439, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.068, 'learning_rate': 9.150979862726452e-06, 'epoch': 1.27} |
|
[2022-12-16 13:36:15,345] [INFO] [timer.py:197:stop] 0/602, RunningAvgSamplesPerSec=6.339726669949757, CurrSamplesPerSec=5.694057788240676, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:36:26,664] [INFO] [timer.py:197:stop] 0/604, RunningAvgSamplesPerSec=6.339747213370603, CurrSamplesPerSec=5.705463611623002, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:36:37,987] [INFO] [timer.py:197:stop] 0/606, RunningAvgSamplesPerSec=6.339748462522443, CurrSamplesPerSec=5.70420465478969, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:36:49,282] [INFO] [timer.py:197:stop] 0/608, RunningAvgSamplesPerSec=6.3397972823506645, CurrSamplesPerSec=5.703827706300002, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:00,638] [INFO] [timer.py:197:stop] 0/610, RunningAvgSamplesPerSec=6.339776098051399, CurrSamplesPerSec=5.684790522173071, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:12,026] [INFO] [timer.py:197:stop] 0/612, RunningAvgSamplesPerSec=6.339667566301425, CurrSamplesPerSec=5.709359905773123, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:23,357] [INFO] [timer.py:197:stop] 0/614, RunningAvgSamplesPerSec=6.339668227082106, CurrSamplesPerSec=5.71898944572501, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:34,717] [INFO] [timer.py:197:stop] 0/616, RunningAvgSamplesPerSec=6.339607910614529, CurrSamplesPerSec=5.684605850491065, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:46,081] [INFO] [timer.py:197:stop] 0/618, RunningAvgSamplesPerSec=6.3395661909756, CurrSamplesPerSec=5.7021224855506185, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:37:57,317] [INFO] [logging.py:68:log_dist] [Rank 0] step=310, skipped=5, lr=[9.204621894113846e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:37:57,319] [INFO] [timer.py:197:stop] 0/620, RunningAvgSamplesPerSec=6.339629943446628, CurrSamplesPerSec=5.731616506771275, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:38:08,653] [INFO] [timer.py:197:stop] 0/622, RunningAvgSamplesPerSec=6.339628430644952, CurrSamplesPerSec=5.696671512017795, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:38:19,973] [INFO] [timer.py:197:stop] 0/624, RunningAvgSamplesPerSec=6.33967461194956, CurrSamplesPerSec=5.721700019089708, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:38:31,257] [INFO] [timer.py:197:stop] 0/626, RunningAvgSamplesPerSec=6.339773691276162, CurrSamplesPerSec=5.75074564508181, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:38:42,544] [INFO] [timer.py:197:stop] 0/628, RunningAvgSamplesPerSec=6.3398743535812185, CurrSamplesPerSec=5.729527488686639, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:38:53,868] [INFO] [timer.py:197:stop] 0/630, RunningAvgSamplesPerSec=6.339870442560419, CurrSamplesPerSec=5.700656767380682, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:39:05,212] [INFO] [timer.py:197:stop] 0/632, RunningAvgSamplesPerSec=6.339829317517667, CurrSamplesPerSec=5.68684029237345, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:39:16,558] [INFO] [timer.py:197:stop] 0/634, RunningAvgSamplesPerSec=6.339819890170821, CurrSamplesPerSec=5.692244696362038, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:39:27,870] [INFO] [timer.py:197:stop] 0/636, RunningAvgSamplesPerSec=6.339825679711417, CurrSamplesPerSec=5.71962529044465, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:39:39,187] [INFO] [timer.py:197:stop] 0/638, RunningAvgSamplesPerSec=6.339859209258358, CurrSamplesPerSec=5.729349926332938, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:39:50,542] [INFO] [logging.py:68:log_dist] [Rank 0] step=320, skipped=5, lr=[9.256533232218034e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:39:50,543] [INFO] [timer.py:197:stop] 0/640, RunningAvgSamplesPerSec=6.339814401453925, CurrSamplesPerSec=5.71844510536356, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:40:01,872] [INFO] [timer.py:197:stop] 0/642, RunningAvgSamplesPerSec=6.339826688723115, CurrSamplesPerSec=5.720280048717393, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:40:13,247] [INFO] [timer.py:197:stop] 0/644, RunningAvgSamplesPerSec=6.339761846868838, CurrSamplesPerSec=5.681088545805493, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:40:24,576] [INFO] [timer.py:197:stop] 0/646, RunningAvgSamplesPerSec=6.3397561426574445, CurrSamplesPerSec=5.706065158483307, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:40:35,928] [INFO] [timer.py:197:stop] 0/648, RunningAvgSamplesPerSec=6.339725988395178, CurrSamplesPerSec=5.689788915516862, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:40:47,268] [INFO] [timer.py:197:stop] 0/650, RunningAvgSamplesPerSec=6.339717893553607, CurrSamplesPerSec=5.6690195340352, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0673, 'learning_rate': 9.281874101213678e-06, 'epoch': 1.38} |
|
[2022-12-16 13:40:58,588] [INFO] [timer.py:197:stop] 0/652, RunningAvgSamplesPerSec=6.339732138996129, CurrSamplesPerSec=5.710183336916263, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:41:09,932] [INFO] [timer.py:197:stop] 0/654, RunningAvgSamplesPerSec=6.339713588073108, CurrSamplesPerSec=5.698703244961602, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:41:21,251] [INFO] [timer.py:197:stop] 0/656, RunningAvgSamplesPerSec=6.3397462039918056, CurrSamplesPerSec=5.704568317909728, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:41:32,597] [INFO] [timer.py:197:stop] 0/658, RunningAvgSamplesPerSec=6.339728329434604, CurrSamplesPerSec=5.688525052206091, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:41:43,927] [INFO] [logging.py:68:log_dist] [Rank 0] step=330, skipped=5, lr=[9.306822072655195e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:41:43,928] [INFO] [timer.py:197:stop] 0/660, RunningAvgSamplesPerSec=6.339748008707906, CurrSamplesPerSec=5.718520634315761, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:41:55,285] [INFO] [timer.py:197:stop] 0/662, RunningAvgSamplesPerSec=6.339712410854758, CurrSamplesPerSec=5.693721308198521, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:42:06,580] [INFO] [timer.py:197:stop] 0/664, RunningAvgSamplesPerSec=6.339760320122381, CurrSamplesPerSec=5.715314115949173, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:42:17,869] [INFO] [timer.py:197:stop] 0/666, RunningAvgSamplesPerSec=6.339852918004305, CurrSamplesPerSec=5.742691326057452, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:42:29,166] [INFO] [timer.py:197:stop] 0/668, RunningAvgSamplesPerSec=6.3399289149016615, CurrSamplesPerSec=5.718510644888239, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:42:40,498] [INFO] [timer.py:197:stop] 0/670, RunningAvgSamplesPerSec=6.33993930819201, CurrSamplesPerSec=5.716523930033197, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:42:51,850] [INFO] [timer.py:197:stop] 0/672, RunningAvgSamplesPerSec=6.3399087697760335, CurrSamplesPerSec=5.702439850286114, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:03,189] [INFO] [timer.py:197:stop] 0/674, RunningAvgSamplesPerSec=6.339904910312488, CurrSamplesPerSec=5.697196479393878, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:14,515] [INFO] [timer.py:197:stop] 0/676, RunningAvgSamplesPerSec=6.339919649025721, CurrSamplesPerSec=5.707675162721676, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:25,854] [INFO] [timer.py:197:stop] 0/678, RunningAvgSamplesPerSec=6.339914916778764, CurrSamplesPerSec=5.711340914229161, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:37,180] [INFO] [logging.py:68:log_dist] [Rank 0] step=340, skipped=5, lr=[9.355586771917604e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:43:37,182] [INFO] [timer.py:197:stop] 0/680, RunningAvgSamplesPerSec=6.3399149903217955, CurrSamplesPerSec=5.695608574111107, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:48,529] [INFO] [timer.py:197:stop] 0/682, RunningAvgSamplesPerSec=6.339897700238457, CurrSamplesPerSec=5.704236412854308, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:43:59,879] [INFO] [timer.py:197:stop] 0/684, RunningAvgSamplesPerSec=6.339888606119401, CurrSamplesPerSec=5.7055628096091775, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:44:11,185] [INFO] [timer.py:197:stop] 0/686, RunningAvgSamplesPerSec=6.339917140932174, CurrSamplesPerSec=5.706974511179494, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:44:22,550] [INFO] [timer.py:197:stop] 0/688, RunningAvgSamplesPerSec=6.339882372446429, CurrSamplesPerSec=5.692362024576135, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:44:33,878] [INFO] [timer.py:197:stop] 0/690, RunningAvgSamplesPerSec=6.339900812437982, CurrSamplesPerSec=5.707581716395236, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:44:45,202] [INFO] [timer.py:197:stop] 0/692, RunningAvgSamplesPerSec=6.339926640281542, CurrSamplesPerSec=5.708136128464922, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:44:56,496] [INFO] [timer.py:197:stop] 0/694, RunningAvgSamplesPerSec=6.339975244187676, CurrSamplesPerSec=5.722016639575232, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:45:07,847] [INFO] [timer.py:197:stop] 0/696, RunningAvgSamplesPerSec=6.339976987812794, CurrSamplesPerSec=5.701426102795623, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:45:19,156] [INFO] [timer.py:197:stop] 0/698, RunningAvgSamplesPerSec=6.340008928018111, CurrSamplesPerSec=5.707072548359031, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:45:30,483] [INFO] [logging.py:68:log_dist] [Rank 0] step=350, skipped=5, lr=[9.402917005361869e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:45:30,485] [INFO] [timer.py:197:stop] 0/700, RunningAvgSamplesPerSec=6.33998969805123, CurrSamplesPerSec=5.688489611358794, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0679, 'learning_rate': 9.402917005361869e-06, 'epoch': 1.48} |
|
[2022-12-16 13:45:41,855] [INFO] [timer.py:197:stop] 0/702, RunningAvgSamplesPerSec=6.339953341047152, CurrSamplesPerSec=5.691112944766182, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:45:53,172] [INFO] [timer.py:197:stop] 0/704, RunningAvgSamplesPerSec=6.339972100121671, CurrSamplesPerSec=5.700012061025391, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:46:04,520] [INFO] [timer.py:197:stop] 0/706, RunningAvgSamplesPerSec=6.339948693119949, CurrSamplesPerSec=5.700656525255543, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:46:15,841] [INFO] [timer.py:197:stop] 0/708, RunningAvgSamplesPerSec=6.339976446041088, CurrSamplesPerSec=5.715632950845321, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:46:27,186] [INFO] [timer.py:197:stop] 0/710, RunningAvgSamplesPerSec=6.339961938542896, CurrSamplesPerSec=5.688084845523418, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:46:38,529] [INFO] [timer.py:197:stop] 0/712, RunningAvgSamplesPerSec=6.339936097781599, CurrSamplesPerSec=5.675824342653396, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:46:49,905] [INFO] [timer.py:197:stop] 0/714, RunningAvgSamplesPerSec=6.339865841099534, CurrSamplesPerSec=5.675659933496987, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:01,280] [INFO] [timer.py:197:stop] 0/716, RunningAvgSamplesPerSec=6.339805621469403, CurrSamplesPerSec=5.6676813516656495, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:12,616] [INFO] [timer.py:197:stop] 0/718, RunningAvgSamplesPerSec=6.339809820815214, CurrSamplesPerSec=5.699940409091249, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:23,994] [INFO] [logging.py:68:log_dist] [Rank 0] step=360, skipped=5, lr=[9.44889475969735e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:47:23,996] [INFO] [timer.py:197:stop] 0/720, RunningAvgSamplesPerSec=6.3397379119814845, CurrSamplesPerSec=5.660866675560336, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:35,325] [INFO] [timer.py:197:stop] 0/722, RunningAvgSamplesPerSec=6.339759354188557, CurrSamplesPerSec=5.711746566085115, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:46,687] [INFO] [timer.py:197:stop] 0/724, RunningAvgSamplesPerSec=6.339718324350815, CurrSamplesPerSec=5.6913977108507305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:47:58,043] [INFO] [timer.py:197:stop] 0/726, RunningAvgSamplesPerSec=6.3396828195960975, CurrSamplesPerSec=5.696925883021481, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:48:09,400] [INFO] [timer.py:197:stop] 0/728, RunningAvgSamplesPerSec=6.339658217967878, CurrSamplesPerSec=5.686045499809826, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:48:20,741] [INFO] [timer.py:197:stop] 0/730, RunningAvgSamplesPerSec=6.339619498024999, CurrSamplesPerSec=5.6979312576386025, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:48:32,064] [INFO] [timer.py:197:stop] 0/732, RunningAvgSamplesPerSec=6.339645849883508, CurrSamplesPerSec=5.720338072503252, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:48:43,365] [INFO] [timer.py:197:stop] 0/734, RunningAvgSamplesPerSec=6.339677598600247, CurrSamplesPerSec=5.718338150034742, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:48:54,704] [INFO] [timer.py:197:stop] 0/736, RunningAvgSamplesPerSec=6.33964311122071, CurrSamplesPerSec=5.676220403816082, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:49:06,037] [INFO] [timer.py:197:stop] 0/738, RunningAvgSamplesPerSec=6.339608348646385, CurrSamplesPerSec=5.6891620983826305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:49:17,398] [INFO] [logging.py:68:log_dist] [Rank 0] step=370, skipped=5, lr=[9.493595187571683e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:49:17,400] [INFO] [timer.py:197:stop] 0/740, RunningAvgSamplesPerSec=6.339572544644652, CurrSamplesPerSec=5.697713077890738, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:49:28,737] [INFO] [timer.py:197:stop] 0/742, RunningAvgSamplesPerSec=6.339578787219607, CurrSamplesPerSec=5.709940655091284, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:49:40,073] [INFO] [timer.py:197:stop] 0/744, RunningAvgSamplesPerSec=6.339529466412074, CurrSamplesPerSec=5.689445704571697, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:49:51,401] [INFO] [timer.py:197:stop] 0/746, RunningAvgSamplesPerSec=6.339517662796265, CurrSamplesPerSec=5.697881911754523, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:50:02,732] [INFO] [timer.py:197:stop] 0/748, RunningAvgSamplesPerSec=6.3395072908390535, CurrSamplesPerSec=5.7089184113183284, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:50:14,073] [INFO] [timer.py:197:stop] 0/750, RunningAvgSamplesPerSec=6.33949993468579, CurrSamplesPerSec=5.695194578076193, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0671, 'learning_rate': 9.51548820454122e-06, 'epoch': 1.59} |
|
[2022-12-16 13:50:25,431] [INFO] [timer.py:197:stop] 0/752, RunningAvgSamplesPerSec=6.339481347956785, CurrSamplesPerSec=5.709314976098911, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:50:36,763] [INFO] [timer.py:197:stop] 0/754, RunningAvgSamplesPerSec=6.3395013841517045, CurrSamplesPerSec=5.719909016876233, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:50:48,093] [INFO] [timer.py:197:stop] 0/756, RunningAvgSamplesPerSec=6.339493013298119, CurrSamplesPerSec=5.700768147125971, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:50:59,392] [INFO] [timer.py:197:stop] 0/758, RunningAvgSamplesPerSec=6.339503676973275, CurrSamplesPerSec=5.708950221892302, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:51:10,704] [INFO] [logging.py:68:log_dist] [Rank 0] step=380, skipped=5, lr=[9.53708734662638e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:51:10,706] [INFO] [timer.py:197:stop] 0/760, RunningAvgSamplesPerSec=6.3395140683285325, CurrSamplesPerSec=5.71419847388043, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:51:22,052] [INFO] [timer.py:197:stop] 0/762, RunningAvgSamplesPerSec=6.3395024927179175, CurrSamplesPerSec=5.709905432716866, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:51:33,437] [INFO] [timer.py:197:stop] 0/764, RunningAvgSamplesPerSec=6.339423427139638, CurrSamplesPerSec=5.66285024659257, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:51:44,783] [INFO] [timer.py:197:stop] 0/766, RunningAvgSamplesPerSec=6.339410511132895, CurrSamplesPerSec=5.694031457786667, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:51:56,141] [INFO] [timer.py:197:stop] 0/768, RunningAvgSamplesPerSec=6.3393775231254565, CurrSamplesPerSec=5.6868733031141225, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:52:07,511] [INFO] [timer.py:197:stop] 0/770, RunningAvgSamplesPerSec=6.339326280601822, CurrSamplesPerSec=5.685077063917621, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:52:18,846] [INFO] [timer.py:197:stop] 0/772, RunningAvgSamplesPerSec=6.33933460713198, CurrSamplesPerSec=5.72251066678781, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:52:30,187] [INFO] [timer.py:197:stop] 0/774, RunningAvgSamplesPerSec=6.339317582023612, CurrSamplesPerSec=5.691922189402574, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:52:41,541] [INFO] [timer.py:197:stop] 0/776, RunningAvgSamplesPerSec=6.3393378217346354, CurrSamplesPerSec=5.718710927392318, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:52:52,900] [INFO] [timer.py:197:stop] 0/778, RunningAvgSamplesPerSec=6.339364210343123, CurrSamplesPerSec=5.7224716294350815, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:53:04,209] [INFO] [logging.py:68:log_dist] [Rank 0] step=390, skipped=5, lr=[9.57943484127219e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:53:04,211] [INFO] [timer.py:197:stop] 0/780, RunningAvgSamplesPerSec=6.339405370022461, CurrSamplesPerSec=5.728916094373603, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:53:15,543] [INFO] [timer.py:197:stop] 0/782, RunningAvgSamplesPerSec=6.339414951131689, CurrSamplesPerSec=5.707265720579158, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:53:26,878] [INFO] [timer.py:197:stop] 0/784, RunningAvgSamplesPerSec=6.339417908910616, CurrSamplesPerSec=5.702162457027009, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:53:38,239] [INFO] [timer.py:197:stop] 0/786, RunningAvgSamplesPerSec=6.3393991975344575, CurrSamplesPerSec=5.719818094602671, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:53:49,623] [INFO] [timer.py:197:stop] 0/788, RunningAvgSamplesPerSec=6.339324809483852, CurrSamplesPerSec=5.669576298657823, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:00,943] [INFO] [timer.py:197:stop] 0/790, RunningAvgSamplesPerSec=6.339353765970902, CurrSamplesPerSec=5.719103005166369, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:12,273] [INFO] [timer.py:197:stop] 0/792, RunningAvgSamplesPerSec=6.339365994979452, CurrSamplesPerSec=5.6981792094536186, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:23,570] [INFO] [timer.py:197:stop] 0/794, RunningAvgSamplesPerSec=6.339428107509493, CurrSamplesPerSec=5.714707454353629, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:34,890] [INFO] [timer.py:197:stop] 0/796, RunningAvgSamplesPerSec=6.33942946769859, CurrSamplesPerSec=5.700301591915373, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:46,228] [INFO] [timer.py:197:stop] 0/798, RunningAvgSamplesPerSec=6.339429936314821, CurrSamplesPerSec=5.698297024366172, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:54:57,531] [INFO] [logging.py:68:log_dist] [Rank 0] step=400, skipped=5, lr=[9.620696382156558e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:54:57,533] [INFO] [timer.py:197:stop] 0/800, RunningAvgSamplesPerSec=6.339431347765237, CurrSamplesPerSec=5.699998505115901, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0739, 'learning_rate': 9.620696382156558e-06, 'epoch': 1.69} |
|
[2022-12-16 13:55:08,867] [INFO] [timer.py:197:stop] 0/802, RunningAvgSamplesPerSec=6.339439738546504, CurrSamplesPerSec=5.716013652228299, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:55:20,218] [INFO] [timer.py:197:stop] 0/804, RunningAvgSamplesPerSec=6.339440725751527, CurrSamplesPerSec=5.691773018155927, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:55:31,550] [INFO] [timer.py:197:stop] 0/806, RunningAvgSamplesPerSec=6.33944356531772, CurrSamplesPerSec=5.699770000917279, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:55:42,902] [INFO] [timer.py:197:stop] 0/808, RunningAvgSamplesPerSec=6.339425456518778, CurrSamplesPerSec=5.682000057235838, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:55:54,240] [INFO] [timer.py:197:stop] 0/810, RunningAvgSamplesPerSec=6.339427784324849, CurrSamplesPerSec=5.713082537732757, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:56:05,570] [INFO] [timer.py:197:stop] 0/812, RunningAvgSamplesPerSec=6.3394158974920405, CurrSamplesPerSec=5.694748264487391, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:56:16,915] [INFO] [timer.py:197:stop] 0/814, RunningAvgSamplesPerSec=6.339398309772514, CurrSamplesPerSec=5.6812792415486575, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:56:28,242] [INFO] [timer.py:197:stop] 0/816, RunningAvgSamplesPerSec=6.339398027966287, CurrSamplesPerSec=5.7140795139842355, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:56:39,574] [INFO] [timer.py:197:stop] 0/818, RunningAvgSamplesPerSec=6.339407071520012, CurrSamplesPerSec=5.71556626029901, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:56:50,917] [INFO] [logging.py:68:log_dist] [Rank 0] step=410, skipped=5, lr=[9.660926275674324e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:56:50,919] [INFO] [timer.py:197:stop] 0/820, RunningAvgSamplesPerSec=6.339398405559651, CurrSamplesPerSec=5.700964767211974, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:02,288] [INFO] [timer.py:197:stop] 0/822, RunningAvgSamplesPerSec=6.339328253275692, CurrSamplesPerSec=5.679953533791154, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:13,645] [INFO] [timer.py:197:stop] 0/824, RunningAvgSamplesPerSec=6.3393217920803755, CurrSamplesPerSec=5.70598171053535, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:24,985] [INFO] [timer.py:197:stop] 0/826, RunningAvgSamplesPerSec=6.339315280868552, CurrSamplesPerSec=5.702338095912809, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:36,365] [INFO] [timer.py:197:stop] 0/828, RunningAvgSamplesPerSec=6.339260275276137, CurrSamplesPerSec=5.690148330458817, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:47,721] [INFO] [timer.py:197:stop] 0/830, RunningAvgSamplesPerSec=6.33923096134003, CurrSamplesPerSec=5.700336453805767, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:57:59,082] [INFO] [timer.py:197:stop] 0/832, RunningAvgSamplesPerSec=6.339196943996223, CurrSamplesPerSec=5.693846426785746, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:58:10,433] [INFO] [timer.py:197:stop] 0/834, RunningAvgSamplesPerSec=6.339177669502994, CurrSamplesPerSec=5.695319036406316, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:58:21,779] [INFO] [timer.py:197:stop] 0/836, RunningAvgSamplesPerSec=6.339162945202363, CurrSamplesPerSec=5.700884374109222, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:58:33,178] [INFO] [timer.py:197:stop] 0/838, RunningAvgSamplesPerSec=6.339079736344104, CurrSamplesPerSec=5.647579597527192, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:58:44,466] [INFO] [logging.py:68:log_dist] [Rank 0] step=420, skipped=5, lr=[9.700174853763023e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 13:58:44,468] [INFO] [timer.py:197:stop] 0/840, RunningAvgSamplesPerSec=6.33911936727273, CurrSamplesPerSec=5.727943757144047, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:58:55,801] [INFO] [timer.py:197:stop] 0/842, RunningAvgSamplesPerSec=6.339122279687559, CurrSamplesPerSec=5.704047081836697, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:59:07,134] [INFO] [timer.py:197:stop] 0/844, RunningAvgSamplesPerSec=6.33912318026892, CurrSamplesPerSec=5.696926850254961, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:59:18,493] [INFO] [timer.py:197:stop] 0/846, RunningAvgSamplesPerSec=6.339085626899937, CurrSamplesPerSec=5.686659583431364, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:59:29,826] [INFO] [timer.py:197:stop] 0/848, RunningAvgSamplesPerSec=6.339094325820955, CurrSamplesPerSec=5.702049811574694, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 13:59:41,137] [INFO] [timer.py:197:stop] 0/850, RunningAvgSamplesPerSec=6.339128153912398, CurrSamplesPerSec=5.726174495728036, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0675, 'learning_rate': 9.719445885591654e-06, 'epoch': 1.8} |
|
[2022-12-16 13:59:52,450] [INFO] [timer.py:197:stop] 0/852, RunningAvgSamplesPerSec=6.339135535487173, CurrSamplesPerSec=5.704070111186627, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:00:03,799] [INFO] [timer.py:197:stop] 0/854, RunningAvgSamplesPerSec=6.339113121822704, CurrSamplesPerSec=5.703067658172513, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:00:15,141] [INFO] [timer.py:197:stop] 0/856, RunningAvgSamplesPerSec=6.339054028633934, CurrSamplesPerSec=5.676935614246414, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:00:26,484] [INFO] [timer.py:197:stop] 0/858, RunningAvgSamplesPerSec=6.339043425322181, CurrSamplesPerSec=5.724982083002299, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:00:37,788] [INFO] [logging.py:68:log_dist] [Rank 0] step=430, skipped=5, lr=[9.738488852516646e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:00:37,790] [INFO] [timer.py:197:stop] 0/860, RunningAvgSamplesPerSec=6.339088069486757, CurrSamplesPerSec=5.7306706569410135, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:00:49,138] [INFO] [timer.py:197:stop] 0/862, RunningAvgSamplesPerSec=6.3390868336999295, CurrSamplesPerSec=5.698970864441769, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:00,453] [INFO] [timer.py:197:stop] 0/864, RunningAvgSamplesPerSec=6.339104410312392, CurrSamplesPerSec=5.71063547488767, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:11,787] [INFO] [timer.py:197:stop] 0/866, RunningAvgSamplesPerSec=6.339121311704636, CurrSamplesPerSec=5.716024363230836, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:23,132] [INFO] [timer.py:197:stop] 0/868, RunningAvgSamplesPerSec=6.339120228266978, CurrSamplesPerSec=5.683368354264379, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:34,468] [INFO] [timer.py:197:stop] 0/870, RunningAvgSamplesPerSec=6.33910997682536, CurrSamplesPerSec=5.710466856224587, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:45,820] [INFO] [timer.py:197:stop] 0/872, RunningAvgSamplesPerSec=6.339100845900049, CurrSamplesPerSec=5.703384401060349, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:01:57,207] [INFO] [timer.py:197:stop] 0/874, RunningAvgSamplesPerSec=6.339047018285702, CurrSamplesPerSec=5.665372737563986, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:02:08,567] [INFO] [timer.py:197:stop] 0/876, RunningAvgSamplesPerSec=6.33901200536423, CurrSamplesPerSec=5.6864747902976305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:02:19,890] [INFO] [timer.py:197:stop] 0/878, RunningAvgSamplesPerSec=6.339029791739453, CurrSamplesPerSec=5.7194973302594745, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:02:31,207] [INFO] [logging.py:68:log_dist] [Rank 0] step=440, skipped=5, lr=[9.775911746761854e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:02:31,208] [INFO] [timer.py:197:stop] 0/880, RunningAvgSamplesPerSec=6.33903170931319, CurrSamplesPerSec=5.708432797717439, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:02:42,490] [INFO] [timer.py:197:stop] 0/882, RunningAvgSamplesPerSec=6.339064910628587, CurrSamplesPerSec=5.726082885578239, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:02:53,825] [INFO] [timer.py:197:stop] 0/884, RunningAvgSamplesPerSec=6.3390667056076735, CurrSamplesPerSec=5.704602019706414, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:03:05,198] [INFO] [timer.py:197:stop] 0/886, RunningAvgSamplesPerSec=6.339014928405507, CurrSamplesPerSec=5.678717101004349, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:03:16,505] [INFO] [timer.py:197:stop] 0/888, RunningAvgSamplesPerSec=6.339056665813401, CurrSamplesPerSec=5.714859533511053, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:03:27,875] [INFO] [timer.py:197:stop] 0/890, RunningAvgSamplesPerSec=6.339010932760778, CurrSamplesPerSec=5.69104489471728, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:03:39,219] [INFO] [timer.py:197:stop] 0/892, RunningAvgSamplesPerSec=6.338978175113135, CurrSamplesPerSec=5.678695236969803, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:03:50,613] [INFO] [timer.py:197:stop] 0/894, RunningAvgSamplesPerSec=6.338863748832803, CurrSamplesPerSec=5.6349953324302575, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:04:01,935] [INFO] [timer.py:197:stop] 0/896, RunningAvgSamplesPerSec=6.338875518439516, CurrSamplesPerSec=5.706703470467459, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:04:13,303] [INFO] [timer.py:197:stop] 0/898, RunningAvgSamplesPerSec=6.338837763473147, CurrSamplesPerSec=5.676794430539827, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:04:24,632] [INFO] [logging.py:68:log_dist] [Rank 0] step=450, skipped=5, lr=[9.812484046603779e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:04:24,634] [INFO] [timer.py:197:stop] 0/900, RunningAvgSamplesPerSec=6.338848880013622, CurrSamplesPerSec=5.690834722040079, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0662, 'learning_rate': 9.812484046603779e-06, 'epoch': 1.91} |
|
[2022-12-16 14:04:35,954] [INFO] [timer.py:197:stop] 0/902, RunningAvgSamplesPerSec=6.338874271877185, CurrSamplesPerSec=5.710595141563633, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:04:47,356] [INFO] [timer.py:197:stop] 0/904, RunningAvgSamplesPerSec=6.338794422774433, CurrSamplesPerSec=5.671700431786738, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:04:58,670] [INFO] [timer.py:197:stop] 0/906, RunningAvgSamplesPerSec=6.338806125893732, CurrSamplesPerSec=5.714749549054262, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:05:09,998] [INFO] [timer.py:197:stop] 0/908, RunningAvgSamplesPerSec=6.338802150105241, CurrSamplesPerSec=5.70888028766234, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:05:21,343] [INFO] [timer.py:197:stop] 0/910, RunningAvgSamplesPerSec=6.338808628157282, CurrSamplesPerSec=5.69326556555695, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:05:32,694] [INFO] [timer.py:197:stop] 0/912, RunningAvgSamplesPerSec=6.338800685386852, CurrSamplesPerSec=5.699132028984794, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:05:44,031] [INFO] [timer.py:197:stop] 0/914, RunningAvgSamplesPerSec=6.338801852033477, CurrSamplesPerSec=5.7040834440531585, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:05:55,373] [INFO] [timer.py:197:stop] 0/916, RunningAvgSamplesPerSec=6.338794724643747, CurrSamplesPerSec=5.721220520424191, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:06:06,679] [INFO] [timer.py:197:stop] 0/918, RunningAvgSamplesPerSec=6.338835604451315, CurrSamplesPerSec=5.713366345020804, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:06:18,011] [INFO] [logging.py:68:log_dist] [Rank 0] step=460, skipped=5, lr=[9.84824356101363e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:06:18,013] [INFO] [timer.py:197:stop] 0/920, RunningAvgSamplesPerSec=6.3388391544457265, CurrSamplesPerSec=5.708192692317932, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:06:29,378] [INFO] [timer.py:197:stop] 0/922, RunningAvgSamplesPerSec=6.3387960574018845, CurrSamplesPerSec=5.702751919676429, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:06:40,716] [INFO] [timer.py:197:stop] 0/924, RunningAvgSamplesPerSec=6.338775821963682, CurrSamplesPerSec=5.704209745753348, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:06:52,042] [INFO] [timer.py:197:stop] 0/926, RunningAvgSamplesPerSec=6.338796058710522, CurrSamplesPerSec=5.7262526720404985, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:07:03,367] [INFO] [timer.py:197:stop] 0/928, RunningAvgSamplesPerSec=6.338807359023693, CurrSamplesPerSec=5.711466808399003, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:07:14,713] [INFO] [timer.py:197:stop] 0/930, RunningAvgSamplesPerSec=6.3387945089784, CurrSamplesPerSec=5.711466565354717, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:07:26,078] [INFO] [timer.py:197:stop] 0/932, RunningAvgSamplesPerSec=6.338758976723027, CurrSamplesPerSec=5.670853079407315, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:07:37,451] [INFO] [timer.py:197:stop] 0/934, RunningAvgSamplesPerSec=6.338711815063571, CurrSamplesPerSec=5.681176798107868, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:07:48,763] [INFO] [timer.py:197:stop] 0/936, RunningAvgSamplesPerSec=6.338725117276131, CurrSamplesPerSec=5.705479133818621, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:00,080] [INFO] [timer.py:197:stop] 0/938, RunningAvgSamplesPerSec=6.338733055113867, CurrSamplesPerSec=5.7039661169667735, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:11,453] [INFO] [logging.py:68:log_dist] [Rank 0] step=470, skipped=5, lr=[9.883225632758308e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:08:11,454] [INFO] [timer.py:197:stop] 0/940, RunningAvgSamplesPerSec=6.338727478621597, CurrSamplesPerSec=5.71444078788409, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:22,842] [INFO] [timer.py:197:stop] 0/942, RunningAvgSamplesPerSec=6.338678473769538, CurrSamplesPerSec=5.684653281316449, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:31,392] [INFO] [timer.py:197:stop] 0/944, RunningAvgSamplesPerSec=6.341940146686614, CurrSamplesPerSec=10.143572907692075, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:42,755] [INFO] [timer.py:197:stop] 0/946, RunningAvgSamplesPerSec=6.341880087981473, CurrSamplesPerSec=5.65531315172878, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:08:54,074] [INFO] [timer.py:197:stop] 0/948, RunningAvgSamplesPerSec=6.341899798124187, CurrSamplesPerSec=5.714895547040833, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:09:05,413] [INFO] [timer.py:197:stop] 0/950, RunningAvgSamplesPerSec=6.341895452676643, CurrSamplesPerSec=5.71196582193226, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0615, 'learning_rate': 9.900435550016748e-06, 'epoch': 2.01} |
|
[2022-12-16 14:09:16,729] [INFO] [timer.py:197:stop] 0/952, RunningAvgSamplesPerSec=6.341902533421902, CurrSamplesPerSec=5.701898413407802, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:09:28,062] [INFO] [timer.py:197:stop] 0/954, RunningAvgSamplesPerSec=6.341888653558534, CurrSamplesPerSec=5.6888196864128115, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:09:39,353] [INFO] [timer.py:197:stop] 0/956, RunningAvgSamplesPerSec=6.341922296957011, CurrSamplesPerSec=5.706562500651045, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:09:50,669] [INFO] [timer.py:197:stop] 0/958, RunningAvgSamplesPerSec=6.341944759675942, CurrSamplesPerSec=5.709868753326772, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:01,997] [INFO] [logging.py:68:log_dist] [Rank 0] step=480, skipped=5, lr=[9.917463348331534e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:10:01,999] [INFO] [timer.py:197:stop] 0/960, RunningAvgSamplesPerSec=6.341948353997388, CurrSamplesPerSec=5.681460811183431, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:13,331] [INFO] [timer.py:197:stop] 0/962, RunningAvgSamplesPerSec=6.341906485418308, CurrSamplesPerSec=5.683349823598064, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:24,663] [INFO] [timer.py:197:stop] 0/964, RunningAvgSamplesPerSec=6.34190638859447, CurrSamplesPerSec=5.716055766174184, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:36,011] [INFO] [timer.py:197:stop] 0/966, RunningAvgSamplesPerSec=6.341885312885797, CurrSamplesPerSec=5.682461215444921, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:47,359] [INFO] [timer.py:197:stop] 0/968, RunningAvgSamplesPerSec=6.341865407507428, CurrSamplesPerSec=5.686475753985557, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:10:58,703] [INFO] [timer.py:197:stop] 0/970, RunningAvgSamplesPerSec=6.341830751002844, CurrSamplesPerSec=5.699992211322698, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:11:09,981] [INFO] [timer.py:197:stop] 0/972, RunningAvgSamplesPerSec=6.3418804675108085, CurrSamplesPerSec=5.728405313345917, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:11:21,335] [INFO] [timer.py:197:stop] 0/974, RunningAvgSamplesPerSec=6.3418344045834445, CurrSamplesPerSec=5.673371435431231, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:11:32,674] [INFO] [timer.py:197:stop] 0/976, RunningAvgSamplesPerSec=6.341824861899911, CurrSamplesPerSec=5.70538988234719, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:11:44,001] [INFO] [timer.py:197:stop] 0/978, RunningAvgSamplesPerSec=6.341817990135774, CurrSamplesPerSec=5.707448712355747, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:11:55,348] [INFO] [logging.py:68:log_dist] [Rank 0] step=490, skipped=5, lr=[9.950987726012135e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:11:55,350] [INFO] [timer.py:197:stop] 0/980, RunningAvgSamplesPerSec=6.341799433135131, CurrSamplesPerSec=5.695586096423975, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:12:06,683] [INFO] [timer.py:197:stop] 0/982, RunningAvgSamplesPerSec=6.341779990663049, CurrSamplesPerSec=5.696961912690353, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:12:18,051] [INFO] [timer.py:197:stop] 0/984, RunningAvgSamplesPerSec=6.341719172449315, CurrSamplesPerSec=5.6837885755779025, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:12:29,381] [INFO] [timer.py:197:stop] 0/986, RunningAvgSamplesPerSec=6.3417043868592184, CurrSamplesPerSec=5.705059579181187, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:12:40,734] [INFO] [timer.py:197:stop] 0/988, RunningAvgSamplesPerSec=6.341692069686432, CurrSamplesPerSec=5.710390568055206, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:12:52,100] [INFO] [timer.py:197:stop] 0/990, RunningAvgSamplesPerSec=6.341653951670791, CurrSamplesPerSec=5.689756353319079, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:13:03,398] [INFO] [timer.py:197:stop] 0/992, RunningAvgSamplesPerSec=6.341680481354011, CurrSamplesPerSec=5.729182646250193, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:13:14,687] [INFO] [timer.py:197:stop] 0/994, RunningAvgSamplesPerSec=6.341738585222543, CurrSamplesPerSec=5.734979308517798, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:13:26,027] [INFO] [timer.py:197:stop] 0/996, RunningAvgSamplesPerSec=6.341711829161802, CurrSamplesPerSec=5.68931595605461, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:13:37,337] [INFO] [timer.py:197:stop] 0/998, RunningAvgSamplesPerSec=6.341744360489826, CurrSamplesPerSec=5.727179957039876, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:13:48,636] [INFO] [logging.py:68:log_dist] [Rank 0] step=500, skipped=5, lr=[9.98382788472848e-06], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:13:48,638] [INFO] [timer.py:197:stop] 0/1000, RunningAvgSamplesPerSec=6.341788114796591, CurrSamplesPerSec=5.73382854195397, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0323, 'learning_rate': 9.98382788472848e-06, 'epoch': 2.12} |
|
[2022-12-16 14:13:59,943] [INFO] [timer.py:197:stop] 0/1002, RunningAvgSamplesPerSec=6.341826419288528, CurrSamplesPerSec=5.737755101866328, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:14:11,272] [INFO] [timer.py:197:stop] 0/1004, RunningAvgSamplesPerSec=6.341834083046816, CurrSamplesPerSec=5.716180164169838, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:14:22,617] [INFO] [timer.py:197:stop] 0/1006, RunningAvgSamplesPerSec=6.341825232645641, CurrSamplesPerSec=5.695088490706839, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:14:33,917] [INFO] [timer.py:197:stop] 0/1008, RunningAvgSamplesPerSec=6.341866326025775, CurrSamplesPerSec=5.733372724061965, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:14:45,219] [INFO] [timer.py:197:stop] 0/1010, RunningAvgSamplesPerSec=6.34190907891135, CurrSamplesPerSec=5.7319802464981136, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:14:56,525] [INFO] [timer.py:197:stop] 0/1012, RunningAvgSamplesPerSec=6.341945427652831, CurrSamplesPerSec=5.736810167567531, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:15:07,839] [INFO] [timer.py:197:stop] 0/1014, RunningAvgSamplesPerSec=6.341972683683957, CurrSamplesPerSec=5.723454312825036, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:15:19,158] [INFO] [timer.py:197:stop] 0/1016, RunningAvgSamplesPerSec=6.341996054053865, CurrSamplesPerSec=5.714832523661583, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:15:30,481] [INFO] [timer.py:197:stop] 0/1018, RunningAvgSamplesPerSec=6.342007970755142, CurrSamplesPerSec=5.711178572411817, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:15:41,794] [INFO] [logging.py:68:log_dist] [Rank 0] step=510, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:15:41,795] [INFO] [timer.py:197:stop] 0/1020, RunningAvgSamplesPerSec=6.342031963556685, CurrSamplesPerSec=5.733811640372937, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:15:53,096] [INFO] [timer.py:197:stop] 0/1022, RunningAvgSamplesPerSec=6.342077485529769, CurrSamplesPerSec=5.72531347603008, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:16:04,384] [INFO] [timer.py:197:stop] 0/1024, RunningAvgSamplesPerSec=6.3421136079117195, CurrSamplesPerSec=5.729963125598802, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:16:15,695] [INFO] [timer.py:197:stop] 0/1026, RunningAvgSamplesPerSec=6.342141516295768, CurrSamplesPerSec=5.7137732576155456, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:16:26,993] [INFO] [timer.py:197:stop] 0/1028, RunningAvgSamplesPerSec=6.342184555428346, CurrSamplesPerSec=5.73141874572896, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:16:38,254] [INFO] [timer.py:197:stop] 0/1030, RunningAvgSamplesPerSec=6.342268748281073, CurrSamplesPerSec=5.748393992222705, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:16:49,595] [INFO] [timer.py:197:stop] 0/1032, RunningAvgSamplesPerSec=6.342256928534129, CurrSamplesPerSec=5.707608172317402, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:00,885] [INFO] [timer.py:197:stop] 0/1034, RunningAvgSamplesPerSec=6.34230569518051, CurrSamplesPerSec=5.729498138778034, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:12,176] [INFO] [timer.py:197:stop] 0/1036, RunningAvgSamplesPerSec=6.342340358339657, CurrSamplesPerSec=5.728463991073313, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:23,508] [INFO] [timer.py:197:stop] 0/1038, RunningAvgSamplesPerSec=6.342352635604062, CurrSamplesPerSec=5.71814934344548, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:34,822] [INFO] [logging.py:68:log_dist] [Rank 0] step=520, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:17:34,823] [INFO] [timer.py:197:stop] 0/1040, RunningAvgSamplesPerSec=6.342354697339167, CurrSamplesPerSec=5.694759137565851, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:46,162] [INFO] [timer.py:197:stop] 0/1042, RunningAvgSamplesPerSec=6.342361815270279, CurrSamplesPerSec=5.706376654359612, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:17:57,487] [INFO] [timer.py:197:stop] 0/1044, RunningAvgSamplesPerSec=6.34235592630326, CurrSamplesPerSec=5.700416105141542, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:18:08,833] [INFO] [timer.py:197:stop] 0/1046, RunningAvgSamplesPerSec=6.342342159138999, CurrSamplesPerSec=5.691955741941306, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:18:20,144] [INFO] [timer.py:197:stop] 0/1048, RunningAvgSamplesPerSec=6.342370600740833, CurrSamplesPerSec=5.702918629122964, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:18:31,465] [INFO] [timer.py:197:stop] 0/1050, RunningAvgSamplesPerSec=6.342387230037039, CurrSamplesPerSec=5.715598631793174, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0333, 'learning_rate': 1e-05, 'epoch': 2.22} |
|
[2022-12-16 14:18:42,812] [INFO] [timer.py:197:stop] 0/1052, RunningAvgSamplesPerSec=6.3423771167743235, CurrSamplesPerSec=5.701767127426415, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:18:54,153] [INFO] [timer.py:197:stop] 0/1054, RunningAvgSamplesPerSec=6.342372589094107, CurrSamplesPerSec=5.7018460920706975, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:19:05,510] [INFO] [timer.py:197:stop] 0/1056, RunningAvgSamplesPerSec=6.342383222237946, CurrSamplesPerSec=5.714399670907678, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:19:16,839] [INFO] [timer.py:197:stop] 0/1058, RunningAvgSamplesPerSec=6.342371050847959, CurrSamplesPerSec=5.6924518346253254, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:19:28,246] [INFO] [logging.py:68:log_dist] [Rank 0] step=530, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:19:28,247] [INFO] [timer.py:197:stop] 0/1060, RunningAvgSamplesPerSec=6.342299448823461, CurrSamplesPerSec=5.650171271874156, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:19:39,585] [INFO] [timer.py:197:stop] 0/1062, RunningAvgSamplesPerSec=6.342313857990676, CurrSamplesPerSec=5.7169429810260475, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:19:51,015] [INFO] [timer.py:197:stop] 0/1064, RunningAvgSamplesPerSec=6.342204598759972, CurrSamplesPerSec=5.628198329627124, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:02,321] [INFO] [timer.py:197:stop] 0/1066, RunningAvgSamplesPerSec=6.342220668936828, CurrSamplesPerSec=5.71764389040324, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:13,641] [INFO] [timer.py:197:stop] 0/1068, RunningAvgSamplesPerSec=6.342239595971297, CurrSamplesPerSec=5.710870196981978, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:24,969] [INFO] [timer.py:197:stop] 0/1070, RunningAvgSamplesPerSec=6.342253991058788, CurrSamplesPerSec=5.718209516579019, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:36,325] [INFO] [timer.py:197:stop] 0/1072, RunningAvgSamplesPerSec=6.342224287357595, CurrSamplesPerSec=5.677394990102672, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:47,669] [INFO] [timer.py:197:stop] 0/1074, RunningAvgSamplesPerSec=6.342204821552335, CurrSamplesPerSec=5.681042376756076, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:20:58,990] [INFO] [timer.py:197:stop] 0/1076, RunningAvgSamplesPerSec=6.3422140360505805, CurrSamplesPerSec=5.7117669838789435, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:21:10,325] [INFO] [timer.py:197:stop] 0/1078, RunningAvgSamplesPerSec=6.342189336420485, CurrSamplesPerSec=5.700909557246923, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:21:21,654] [INFO] [logging.py:68:log_dist] [Rank 0] step=540, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:21:21,655] [INFO] [timer.py:197:stop] 0/1080, RunningAvgSamplesPerSec=6.3421901246825785, CurrSamplesPerSec=5.694489014269896, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:21:32,959] [INFO] [timer.py:197:stop] 0/1082, RunningAvgSamplesPerSec=6.342219345786515, CurrSamplesPerSec=5.711439344525709, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:21:44,279] [INFO] [timer.py:197:stop] 0/1084, RunningAvgSamplesPerSec=6.342212591177409, CurrSamplesPerSec=5.7033366571490465, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:21:55,574] [INFO] [timer.py:197:stop] 0/1086, RunningAvgSamplesPerSec=6.3422593216078065, CurrSamplesPerSec=5.724233232120478, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:22:06,888] [INFO] [timer.py:197:stop] 0/1088, RunningAvgSamplesPerSec=6.342280129547322, CurrSamplesPerSec=5.740310165045906, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:22:18,193] [INFO] [timer.py:197:stop] 0/1090, RunningAvgSamplesPerSec=6.342288921707194, CurrSamplesPerSec=5.716471339941928, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:22:29,514] [INFO] [timer.py:197:stop] 0/1092, RunningAvgSamplesPerSec=6.342279477932507, CurrSamplesPerSec=5.693720825125829, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:22:40,839] [INFO] [timer.py:197:stop] 0/1094, RunningAvgSamplesPerSec=6.342282469920605, CurrSamplesPerSec=5.709584078983662, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:22:52,177] [INFO] [timer.py:197:stop] 0/1096, RunningAvgSamplesPerSec=6.342254628403949, CurrSamplesPerSec=5.687472141105936, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:23:03,476] [INFO] [timer.py:197:stop] 0/1098, RunningAvgSamplesPerSec=6.34227071720925, CurrSamplesPerSec=5.719418363402194, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:23:14,765] [INFO] [logging.py:68:log_dist] [Rank 0] step=550, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:23:14,767] [INFO] [timer.py:197:stop] 0/1100, RunningAvgSamplesPerSec=6.342297649309376, CurrSamplesPerSec=5.725382103846932, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0353, 'learning_rate': 1e-05, 'epoch': 2.33} |
|
[2022-12-16 14:23:26,073] [INFO] [timer.py:197:stop] 0/1102, RunningAvgSamplesPerSec=6.3423095286973865, CurrSamplesPerSec=5.718222184768506, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:23:37,407] [INFO] [timer.py:197:stop] 0/1104, RunningAvgSamplesPerSec=6.342307992493998, CurrSamplesPerSec=5.7094963991746575, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:23:48,705] [INFO] [timer.py:197:stop] 0/1106, RunningAvgSamplesPerSec=6.342328757169193, CurrSamplesPerSec=5.738285215559371, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:00,071] [INFO] [timer.py:197:stop] 0/1108, RunningAvgSamplesPerSec=6.34232593711831, CurrSamplesPerSec=5.698098894893131, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:11,376] [INFO] [timer.py:197:stop] 0/1110, RunningAvgSamplesPerSec=6.342336434178085, CurrSamplesPerSec=5.706018582583476, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:22,695] [INFO] [timer.py:197:stop] 0/1112, RunningAvgSamplesPerSec=6.342357512645067, CurrSamplesPerSec=5.722553608490977, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:33,978] [INFO] [timer.py:197:stop] 0/1114, RunningAvgSamplesPerSec=6.342394223584167, CurrSamplesPerSec=5.7394996210805616, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:45,297] [INFO] [timer.py:197:stop] 0/1116, RunningAvgSamplesPerSec=6.342408601242247, CurrSamplesPerSec=5.7215861125861975, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:24:56,637] [INFO] [timer.py:197:stop] 0/1118, RunningAvgSamplesPerSec=6.342398456890344, CurrSamplesPerSec=5.690485835890004, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:25:07,926] [INFO] [logging.py:68:log_dist] [Rank 0] step=560, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:25:07,928] [INFO] [timer.py:197:stop] 0/1120, RunningAvgSamplesPerSec=6.342424392095902, CurrSamplesPerSec=5.707812789246779, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:25:19,264] [INFO] [timer.py:197:stop] 0/1122, RunningAvgSamplesPerSec=6.3424174848206185, CurrSamplesPerSec=5.709425237209748, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:25:30,609] [INFO] [timer.py:197:stop] 0/1124, RunningAvgSamplesPerSec=6.3424051767209475, CurrSamplesPerSec=5.706270392598644, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:25:41,981] [INFO] [timer.py:197:stop] 0/1126, RunningAvgSamplesPerSec=6.342360219547285, CurrSamplesPerSec=5.688850308903612, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:25:53,356] [INFO] [timer.py:197:stop] 0/1128, RunningAvgSamplesPerSec=6.342313010480555, CurrSamplesPerSec=5.661790099182979, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:26:04,709] [INFO] [timer.py:197:stop] 0/1130, RunningAvgSamplesPerSec=6.342280714904902, CurrSamplesPerSec=5.705290690373963, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:26:16,073] [INFO] [timer.py:197:stop] 0/1132, RunningAvgSamplesPerSec=6.342249829287662, CurrSamplesPerSec=5.680866604586072, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:26:27,424] [INFO] [timer.py:197:stop] 0/1134, RunningAvgSamplesPerSec=6.342245949095078, CurrSamplesPerSec=5.696907022034271, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:26:38,757] [INFO] [timer.py:197:stop] 0/1136, RunningAvgSamplesPerSec=6.342257240882108, CurrSamplesPerSec=5.7099586308153585, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:26:50,099] [INFO] [timer.py:197:stop] 0/1138, RunningAvgSamplesPerSec=6.342250841337713, CurrSamplesPerSec=5.710703994250225, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:01,418] [INFO] [logging.py:68:log_dist] [Rank 0] step=570, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:27:01,420] [INFO] [timer.py:197:stop] 0/1140, RunningAvgSamplesPerSec=6.342269594601758, CurrSamplesPerSec=5.701177141826042, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:12,760] [INFO] [timer.py:197:stop] 0/1142, RunningAvgSamplesPerSec=6.34225031713007, CurrSamplesPerSec=5.689825096173746, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:24,106] [INFO] [timer.py:197:stop] 0/1144, RunningAvgSamplesPerSec=6.342225176457225, CurrSamplesPerSec=5.6898470459965065, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:35,624] [INFO] [timer.py:197:stop] 0/1146, RunningAvgSamplesPerSec=6.3421971463894735, CurrSamplesPerSec=5.691119701597441, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:47,491] [INFO] [timer.py:197:stop] 0/1148, RunningAvgSamplesPerSec=6.3421702855318784, CurrSamplesPerSec=5.682071018284411, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:27:59,341] [INFO] [timer.py:197:stop] 0/1150, RunningAvgSamplesPerSec=6.3421815124282555, CurrSamplesPerSec=5.712243440512417, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0353, 'learning_rate': 1e-05, 'epoch': 2.44} |
|
[2022-12-16 14:28:11,032] [INFO] [timer.py:197:stop] 0/1152, RunningAvgSamplesPerSec=6.342142174405201, CurrSamplesPerSec=5.693715269795763, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:28:22,383] [INFO] [timer.py:197:stop] 0/1154, RunningAvgSamplesPerSec=6.342125125868255, CurrSamplesPerSec=5.702317503207661, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:28:33,748] [INFO] [timer.py:197:stop] 0/1156, RunningAvgSamplesPerSec=6.34210247247005, CurrSamplesPerSec=5.685200117179287, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:28:45,231] [INFO] [timer.py:197:stop] 0/1158, RunningAvgSamplesPerSec=6.34209466504466, CurrSamplesPerSec=5.7129103701889825, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:28:56,537] [INFO] [logging.py:68:log_dist] [Rank 0] step=580, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:28:56,539] [INFO] [timer.py:197:stop] 0/1160, RunningAvgSamplesPerSec=6.342087597571019, CurrSamplesPerSec=5.701697369069446, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:29:07,897] [INFO] [timer.py:197:stop] 0/1162, RunningAvgSamplesPerSec=6.342061366679014, CurrSamplesPerSec=5.694718061709522, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:29:19,229] [INFO] [timer.py:197:stop] 0/1164, RunningAvgSamplesPerSec=6.342063688900911, CurrSamplesPerSec=5.702904574752718, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:29:30,596] [INFO] [timer.py:197:stop] 0/1166, RunningAvgSamplesPerSec=6.342025215062624, CurrSamplesPerSec=5.688731919840502, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:29:41,953] [INFO] [timer.py:197:stop] 0/1168, RunningAvgSamplesPerSec=6.341998272912349, CurrSamplesPerSec=5.687137402836735, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:29:53,298] [INFO] [timer.py:197:stop] 0/1170, RunningAvgSamplesPerSec=6.341989796889964, CurrSamplesPerSec=5.698433957059607, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:30:04,664] [INFO] [timer.py:197:stop] 0/1172, RunningAvgSamplesPerSec=6.34195403052995, CurrSamplesPerSec=5.681861507890479, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:30:16,028] [INFO] [timer.py:197:stop] 0/1174, RunningAvgSamplesPerSec=6.341920898227156, CurrSamplesPerSec=5.6802061735601095, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:30:27,378] [INFO] [timer.py:197:stop] 0/1176, RunningAvgSamplesPerSec=6.341886173091101, CurrSamplesPerSec=5.679934544650027, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:30:38,734] [INFO] [timer.py:197:stop] 0/1178, RunningAvgSamplesPerSec=6.341867225778261, CurrSamplesPerSec=5.713497436055486, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:30:50,084] [INFO] [logging.py:68:log_dist] [Rank 0] step=590, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:30:50,086] [INFO] [timer.py:197:stop] 0/1180, RunningAvgSamplesPerSec=6.34185257294048, CurrSamplesPerSec=5.6980594641755085, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:01,423] [INFO] [timer.py:197:stop] 0/1182, RunningAvgSamplesPerSec=6.341854212156899, CurrSamplesPerSec=5.709254504302453, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:12,772] [INFO] [timer.py:197:stop] 0/1184, RunningAvgSamplesPerSec=6.341848295622397, CurrSamplesPerSec=5.699178250528177, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:24,084] [INFO] [timer.py:197:stop] 0/1186, RunningAvgSamplesPerSec=6.3418571902670635, CurrSamplesPerSec=5.715483994542817, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:35,424] [INFO] [timer.py:197:stop] 0/1188, RunningAvgSamplesPerSec=6.341833946588137, CurrSamplesPerSec=5.705600161334081, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:46,795] [INFO] [timer.py:197:stop] 0/1190, RunningAvgSamplesPerSec=6.34179553395068, CurrSamplesPerSec=5.6929719194984365, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:31:58,106] [INFO] [timer.py:197:stop] 0/1192, RunningAvgSamplesPerSec=6.341823510750819, CurrSamplesPerSec=5.711299841849379, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:32:09,451] [INFO] [timer.py:197:stop] 0/1194, RunningAvgSamplesPerSec=6.3418122267706485, CurrSamplesPerSec=5.715445296381856, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:32:20,735] [INFO] [timer.py:197:stop] 0/1196, RunningAvgSamplesPerSec=6.341849477491008, CurrSamplesPerSec=5.734557609331755, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:32:32,046] [INFO] [timer.py:197:stop] 0/1198, RunningAvgSamplesPerSec=6.341877244248112, CurrSamplesPerSec=5.71894119651206, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:32:43,387] [INFO] [logging.py:68:log_dist] [Rank 0] step=600, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:32:43,389] [INFO] [timer.py:197:stop] 0/1200, RunningAvgSamplesPerSec=6.341878607049019, CurrSamplesPerSec=5.696733168302691, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0347, 'learning_rate': 1e-05, 'epoch': 2.54} |
|
[2022-12-16 14:32:54,711] [INFO] [timer.py:197:stop] 0/1202, RunningAvgSamplesPerSec=6.341896471273075, CurrSamplesPerSec=5.717042091551064, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:33:06,065] [INFO] [timer.py:197:stop] 0/1204, RunningAvgSamplesPerSec=6.341897101506787, CurrSamplesPerSec=5.679245733300272, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:33:17,347] [INFO] [timer.py:197:stop] 0/1206, RunningAvgSamplesPerSec=6.341951670126494, CurrSamplesPerSec=5.735465773043581, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:33:28,700] [INFO] [timer.py:197:stop] 0/1208, RunningAvgSamplesPerSec=6.341933813423766, CurrSamplesPerSec=5.693047018526851, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:33:40,014] [INFO] [timer.py:197:stop] 0/1210, RunningAvgSamplesPerSec=6.341962570506191, CurrSamplesPerSec=5.726071648268367, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:33:51,341] [INFO] [timer.py:197:stop] 0/1212, RunningAvgSamplesPerSec=6.34198104703568, CurrSamplesPerSec=5.713993885386633, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:02,655] [INFO] [timer.py:197:stop] 0/1214, RunningAvgSamplesPerSec=6.341989706884567, CurrSamplesPerSec=5.719417875959229, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:13,996] [INFO] [timer.py:197:stop] 0/1216, RunningAvgSamplesPerSec=6.341986548452563, CurrSamplesPerSec=5.713432497815178, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:25,329] [INFO] [timer.py:197:stop] 0/1218, RunningAvgSamplesPerSec=6.341989702811427, CurrSamplesPerSec=5.704049748383485, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:36,668] [INFO] [logging.py:68:log_dist] [Rank 0] step=610, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:34:36,669] [INFO] [timer.py:197:stop] 0/1220, RunningAvgSamplesPerSec=6.341983907936107, CurrSamplesPerSec=5.704984162896505, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:47,992] [INFO] [timer.py:197:stop] 0/1222, RunningAvgSamplesPerSec=6.3420001667946035, CurrSamplesPerSec=5.7295521917594305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:34:59,295] [INFO] [timer.py:197:stop] 0/1224, RunningAvgSamplesPerSec=6.34203665776226, CurrSamplesPerSec=5.742030444913072, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:35:10,632] [INFO] [timer.py:197:stop] 0/1226, RunningAvgSamplesPerSec=6.342035936585133, CurrSamplesPerSec=5.711309320038435, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:35:21,990] [INFO] [timer.py:197:stop] 0/1228, RunningAvgSamplesPerSec=6.342023840723309, CurrSamplesPerSec=5.703659731799368, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:35:33,332] [INFO] [timer.py:197:stop] 0/1230, RunningAvgSamplesPerSec=6.34202606042488, CurrSamplesPerSec=5.719129568004103, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:35:44,666] [INFO] [timer.py:197:stop] 0/1232, RunningAvgSamplesPerSec=6.342021991006609, CurrSamplesPerSec=5.695910227336529, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:35:56,016] [INFO] [timer.py:197:stop] 0/1234, RunningAvgSamplesPerSec=6.342006012194455, CurrSamplesPerSec=5.683263669619541, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:36:07,345] [INFO] [timer.py:197:stop] 0/1236, RunningAvgSamplesPerSec=6.3419942199098385, CurrSamplesPerSec=5.70789798994556, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:36:18,750] [INFO] [timer.py:197:stop] 0/1238, RunningAvgSamplesPerSec=6.341966640691746, CurrSamplesPerSec=5.686711144545196, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:36:30,081] [INFO] [logging.py:68:log_dist] [Rank 0] step=620, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:36:30,083] [INFO] [timer.py:197:stop] 0/1240, RunningAvgSamplesPerSec=6.3419657033705965, CurrSamplesPerSec=5.701581593341636, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:36:41,393] [INFO] [timer.py:197:stop] 0/1242, RunningAvgSamplesPerSec=6.341980020428165, CurrSamplesPerSec=5.731832395172248, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:36:52,764] [INFO] [timer.py:197:stop] 0/1244, RunningAvgSamplesPerSec=6.341953373133708, CurrSamplesPerSec=5.680820439143911, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:37:04,058] [INFO] [timer.py:197:stop] 0/1246, RunningAvgSamplesPerSec=6.34197841556084, CurrSamplesPerSec=5.717911830283779, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:37:15,402] [INFO] [timer.py:197:stop] 0/1248, RunningAvgSamplesPerSec=6.341969066741227, CurrSamplesPerSec=5.705706640688583, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:37:26,914] [INFO] [timer.py:197:stop] 0/1250, RunningAvgSamplesPerSec=6.341964101473373, CurrSamplesPerSec=5.70826430910481, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0348, 'learning_rate': 1e-05, 'epoch': 2.65} |
|
[2022-12-16 14:37:38,762] [INFO] [timer.py:197:stop] 0/1252, RunningAvgSamplesPerSec=6.341913431600818, CurrSamplesPerSec=5.674062660433857, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:37:50,589] [INFO] [timer.py:197:stop] 0/1254, RunningAvgSamplesPerSec=6.341901815390098, CurrSamplesPerSec=5.694548448860545, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:02,393] [INFO] [timer.py:197:stop] 0/1256, RunningAvgSamplesPerSec=6.341883576631976, CurrSamplesPerSec=5.688104612379961, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:14,000] [INFO] [timer.py:197:stop] 0/1258, RunningAvgSamplesPerSec=6.341872794684727, CurrSamplesPerSec=5.703657065617219, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:25,393] [INFO] [logging.py:68:log_dist] [Rank 0] step=630, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:38:25,394] [INFO] [timer.py:197:stop] 0/1260, RunningAvgSamplesPerSec=6.341865366147259, CurrSamplesPerSec=5.705734049457164, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:36,741] [INFO] [timer.py:197:stop] 0/1262, RunningAvgSamplesPerSec=6.341855550405924, CurrSamplesPerSec=5.691122838703123, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:48,184] [INFO] [timer.py:197:stop] 0/1264, RunningAvgSamplesPerSec=6.341869424864738, CurrSamplesPerSec=5.712905750016515, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:38:59,484] [INFO] [timer.py:197:stop] 0/1266, RunningAvgSamplesPerSec=6.341905958246278, CurrSamplesPerSec=5.743291164373362, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:39:10,797] [INFO] [timer.py:197:stop] 0/1268, RunningAvgSamplesPerSec=6.341933834782191, CurrSamplesPerSec=5.732381980420735, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:39:22,092] [INFO] [timer.py:197:stop] 0/1270, RunningAvgSamplesPerSec=6.341976379604057, CurrSamplesPerSec=5.735517732822823, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:39:33,396] [INFO] [timer.py:197:stop] 0/1272, RunningAvgSamplesPerSec=6.341977077258627, CurrSamplesPerSec=5.720831078371896, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:39:44,729] [INFO] [timer.py:197:stop] 0/1274, RunningAvgSamplesPerSec=6.3419649563084874, CurrSamplesPerSec=5.691029933577238, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:39:56,055] [INFO] [timer.py:197:stop] 0/1276, RunningAvgSamplesPerSec=6.341979825791403, CurrSamplesPerSec=5.709503199728328, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:40:07,411] [INFO] [timer.py:197:stop] 0/1278, RunningAvgSamplesPerSec=6.341966728201985, CurrSamplesPerSec=5.696792166154243, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:40:18,742] [INFO] [logging.py:68:log_dist] [Rank 0] step=640, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:40:18,743] [INFO] [timer.py:197:stop] 0/1280, RunningAvgSamplesPerSec=6.341970256692958, CurrSamplesPerSec=5.720286143597552, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:40:30,044] [INFO] [timer.py:197:stop] 0/1282, RunningAvgSamplesPerSec=6.342008220842321, CurrSamplesPerSec=5.737926071842598, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:40:41,373] [INFO] [timer.py:197:stop] 0/1284, RunningAvgSamplesPerSec=6.3419988418237585, CurrSamplesPerSec=5.7099489141936965, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:40:52,693] [INFO] [timer.py:197:stop] 0/1286, RunningAvgSamplesPerSec=6.342022034261011, CurrSamplesPerSec=5.724150472559608, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:41:04,024] [INFO] [timer.py:197:stop] 0/1288, RunningAvgSamplesPerSec=6.3420292694938984, CurrSamplesPerSec=5.6933339102268805, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:41:15,413] [INFO] [timer.py:197:stop] 0/1290, RunningAvgSamplesPerSec=6.341983459677655, CurrSamplesPerSec=5.637183596584856, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:41:26,783] [INFO] [timer.py:197:stop] 0/1292, RunningAvgSamplesPerSec=6.341963235738544, CurrSamplesPerSec=5.684480896928181, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:41:38,215] [INFO] [timer.py:197:stop] 0/1294, RunningAvgSamplesPerSec=6.34183759172952, CurrSamplesPerSec=5.602544304448964, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:41:49,551] [INFO] [timer.py:197:stop] 0/1296, RunningAvgSamplesPerSec=6.341831440484137, CurrSamplesPerSec=5.700941278631235, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:42:00,960] [INFO] [timer.py:197:stop] 0/1298, RunningAvgSamplesPerSec=6.341805815820332, CurrSamplesPerSec=5.6858438857398, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:42:12,912] [INFO] [logging.py:68:log_dist] [Rank 0] step=650, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:42:12,913] [INFO] [timer.py:197:stop] 0/1300, RunningAvgSamplesPerSec=6.341745443623, CurrSamplesPerSec=5.6577931560620875, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0364, 'learning_rate': 1e-05, 'epoch': 2.75} |
|
[2022-12-16 14:42:24,852] [INFO] [timer.py:197:stop] 0/1302, RunningAvgSamplesPerSec=6.341699753784161, CurrSamplesPerSec=5.693112701616461, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:42:36,739] [INFO] [timer.py:197:stop] 0/1304, RunningAvgSamplesPerSec=6.341670847975745, CurrSamplesPerSec=5.672541564369074, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:42:48,129] [INFO] [timer.py:197:stop] 0/1306, RunningAvgSamplesPerSec=6.341618362263155, CurrSamplesPerSec=5.645468028242119, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:42:59,447] [INFO] [timer.py:197:stop] 0/1308, RunningAvgSamplesPerSec=6.341632092950719, CurrSamplesPerSec=5.696219648453861, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:43:10,887] [INFO] [timer.py:197:stop] 0/1310, RunningAvgSamplesPerSec=6.341590601962275, CurrSamplesPerSec=5.6654717420843825, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:43:22,240] [INFO] [timer.py:197:stop] 0/1312, RunningAvgSamplesPerSec=6.341584614965287, CurrSamplesPerSec=5.689816171568963, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:43:33,568] [INFO] [timer.py:197:stop] 0/1314, RunningAvgSamplesPerSec=6.341594613430739, CurrSamplesPerSec=5.7093586914483065, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:43:44,956] [INFO] [timer.py:197:stop] 0/1316, RunningAvgSamplesPerSec=6.341563466477754, CurrSamplesPerSec=5.670641041289616, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:43:56,265] [INFO] [timer.py:197:stop] 0/1318, RunningAvgSamplesPerSec=6.341569001790131, CurrSamplesPerSec=5.7292819370508505, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:44:07,568] [INFO] [logging.py:68:log_dist] [Rank 0] step=660, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:44:07,570] [INFO] [timer.py:197:stop] 0/1320, RunningAvgSamplesPerSec=6.341599110928043, CurrSamplesPerSec=5.733176556079528, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:44:18,913] [INFO] [timer.py:197:stop] 0/1322, RunningAvgSamplesPerSec=6.341575713584571, CurrSamplesPerSec=5.6912922475459755, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:44:30,220] [INFO] [timer.py:197:stop] 0/1324, RunningAvgSamplesPerSec=6.341604782515842, CurrSamplesPerSec=5.733804781788765, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:44:41,563] [INFO] [timer.py:197:stop] 0/1326, RunningAvgSamplesPerSec=6.341584816278544, CurrSamplesPerSec=5.698715826869556, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:44:52,882] [INFO] [timer.py:197:stop] 0/1328, RunningAvgSamplesPerSec=6.3416033148122475, CurrSamplesPerSec=5.721217106174578, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:45:04,217] [INFO] [timer.py:197:stop] 0/1330, RunningAvgSamplesPerSec=6.341592936615098, CurrSamplesPerSec=5.684606813545637, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:45:15,593] [INFO] [timer.py:197:stop] 0/1332, RunningAvgSamplesPerSec=6.341554846742973, CurrSamplesPerSec=5.664314992225257, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:45:26,924] [INFO] [timer.py:197:stop] 0/1334, RunningAvgSamplesPerSec=6.341556591018235, CurrSamplesPerSec=5.699659628332756, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:45:38,272] [INFO] [timer.py:197:stop] 0/1336, RunningAvgSamplesPerSec=6.341547460152438, CurrSamplesPerSec=5.697217760627823, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:45:49,609] [INFO] [timer.py:197:stop] 0/1338, RunningAvgSamplesPerSec=6.3415444610859675, CurrSamplesPerSec=5.698977639944709, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:00,947] [INFO] [logging.py:68:log_dist] [Rank 0] step=670, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:46:00,949] [INFO] [timer.py:197:stop] 0/1340, RunningAvgSamplesPerSec=6.341538353802266, CurrSamplesPerSec=5.699747006276326, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:12,281] [INFO] [timer.py:197:stop] 0/1342, RunningAvgSamplesPerSec=6.341546638536307, CurrSamplesPerSec=5.7052620732819355, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:23,633] [INFO] [timer.py:197:stop] 0/1344, RunningAvgSamplesPerSec=6.341530953943566, CurrSamplesPerSec=5.711561597247238, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:34,962] [INFO] [timer.py:197:stop] 0/1346, RunningAvgSamplesPerSec=6.341535474785801, CurrSamplesPerSec=5.688151860544939, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:46,287] [INFO] [timer.py:197:stop] 0/1348, RunningAvgSamplesPerSec=6.341530994988656, CurrSamplesPerSec=5.706824549931542, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:46:57,648] [INFO] [timer.py:197:stop] 0/1350, RunningAvgSamplesPerSec=6.341510467487031, CurrSamplesPerSec=5.679943438282595, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0368, 'learning_rate': 1e-05, 'epoch': 2.86} |
|
[2022-12-16 14:47:08,981] [INFO] [timer.py:197:stop] 0/1352, RunningAvgSamplesPerSec=6.341513942550877, CurrSamplesPerSec=5.706243221276868, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:47:20,362] [INFO] [timer.py:197:stop] 0/1354, RunningAvgSamplesPerSec=6.341465545784576, CurrSamplesPerSec=5.655782857749175, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:47:31,746] [INFO] [timer.py:197:stop] 0/1356, RunningAvgSamplesPerSec=6.3414346374436645, CurrSamplesPerSec=5.672131394177037, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:47:43,085] [INFO] [timer.py:197:stop] 0/1358, RunningAvgSamplesPerSec=6.34142760831968, CurrSamplesPerSec=5.706950245070932, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:47:54,415] [INFO] [logging.py:68:log_dist] [Rank 0] step=680, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:47:54,416] [INFO] [timer.py:197:stop] 0/1360, RunningAvgSamplesPerSec=6.3414369763987555, CurrSamplesPerSec=5.711564270824477, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:48:05,784] [INFO] [timer.py:197:stop] 0/1362, RunningAvgSamplesPerSec=6.341397986420672, CurrSamplesPerSec=5.664015720082998, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:48:17,363] [INFO] [timer.py:197:stop] 0/1364, RunningAvgSamplesPerSec=6.341386306228154, CurrSamplesPerSec=5.696870025845178, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:48:29,358] [INFO] [timer.py:197:stop] 0/1366, RunningAvgSamplesPerSec=6.341305811961416, CurrSamplesPerSec=5.64646078457769, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:48:40,976] [INFO] [timer.py:197:stop] 0/1368, RunningAvgSamplesPerSec=6.341278132620111, CurrSamplesPerSec=5.6836075791694585, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:48:52,791] [INFO] [timer.py:197:stop] 0/1370, RunningAvgSamplesPerSec=6.341239448006165, CurrSamplesPerSec=5.673257046981256, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:49:04,402] [INFO] [timer.py:197:stop] 0/1372, RunningAvgSamplesPerSec=6.341231709995396, CurrSamplesPerSec=5.701490284002487, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:49:15,720] [INFO] [timer.py:197:stop] 0/1374, RunningAvgSamplesPerSec=6.3412303326014925, CurrSamplesPerSec=5.698467586413149, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:49:27,138] [INFO] [timer.py:197:stop] 0/1376, RunningAvgSamplesPerSec=6.341202115120962, CurrSamplesPerSec=5.696106754198011, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:49:38,516] [INFO] [timer.py:197:stop] 0/1378, RunningAvgSamplesPerSec=6.341153200835064, CurrSamplesPerSec=5.654166262600238, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:49:49,936] [INFO] [logging.py:68:log_dist] [Rank 0] step=690, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:49:49,938] [INFO] [timer.py:197:stop] 0/1380, RunningAvgSamplesPerSec=6.341132224930991, CurrSamplesPerSec=5.68624496005538, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:01,253] [INFO] [timer.py:197:stop] 0/1382, RunningAvgSamplesPerSec=6.34112280256516, CurrSamplesPerSec=5.695159054075012, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:12,589] [INFO] [timer.py:197:stop] 0/1384, RunningAvgSamplesPerSec=6.341123372534469, CurrSamplesPerSec=5.6996046856792315, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:23,995] [INFO] [timer.py:197:stop] 0/1386, RunningAvgSamplesPerSec=6.341059047870384, CurrSamplesPerSec=5.647039024132055, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:35,562] [INFO] [timer.py:197:stop] 0/1388, RunningAvgSamplesPerSec=6.341045577125367, CurrSamplesPerSec=5.692134856429986, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:46,957] [INFO] [timer.py:197:stop] 0/1390, RunningAvgSamplesPerSec=6.340997144458177, CurrSamplesPerSec=5.681990676077318, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:50:58,305] [INFO] [timer.py:197:stop] 0/1392, RunningAvgSamplesPerSec=6.3409874033737506, CurrSamplesPerSec=5.687776307523125, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:51:09,675] [INFO] [timer.py:197:stop] 0/1394, RunningAvgSamplesPerSec=6.340967599324559, CurrSamplesPerSec=5.682356083068968, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:51:21,023] [INFO] [timer.py:197:stop] 0/1396, RunningAvgSamplesPerSec=6.34094175660239, CurrSamplesPerSec=5.678554205979396, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:51:32,541] [INFO] [timer.py:197:stop] 0/1398, RunningAvgSamplesPerSec=6.340900730423903, CurrSamplesPerSec=5.660423338680644, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:51:44,143] [INFO] [logging.py:68:log_dist] [Rank 0] step=700, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:51:44,145] [INFO] [timer.py:197:stop] 0/1400, RunningAvgSamplesPerSec=6.3408752815569525, CurrSamplesPerSec=5.673991419538569, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0384, 'learning_rate': 1e-05, 'epoch': 2.97} |
|
[2022-12-16 14:51:55,483] [INFO] [timer.py:197:stop] 0/1402, RunningAvgSamplesPerSec=6.340859458400576, CurrSamplesPerSec=5.685104274845516, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:52:07,020] [INFO] [timer.py:197:stop] 0/1404, RunningAvgSamplesPerSec=6.3408292407124565, CurrSamplesPerSec=5.687765702124439, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:52:18,470] [INFO] [timer.py:197:stop] 0/1406, RunningAvgSamplesPerSec=6.340845264199506, CurrSamplesPerSec=5.7009642829093865, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:52:29,807] [INFO] [timer.py:197:stop] 0/1408, RunningAvgSamplesPerSec=6.3408450665175335, CurrSamplesPerSec=5.693712129831391, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:52:41,150] [INFO] [timer.py:197:stop] 0/1410, RunningAvgSamplesPerSec=6.340834831800001, CurrSamplesPerSec=5.713949855771624, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:52:52,490] [INFO] [timer.py:197:stop] 0/1412, RunningAvgSamplesPerSec=6.340832970670184, CurrSamplesPerSec=5.697333601041515, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:03,836] [INFO] [timer.py:197:stop] 0/1414, RunningAvgSamplesPerSec=6.340821690172965, CurrSamplesPerSec=5.699762255333288, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:12,376] [INFO] [timer.py:197:stop] 0/1416, RunningAvgSamplesPerSec=6.343001806368919, CurrSamplesPerSec=10.222363241398545, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:23,743] [INFO] [timer.py:197:stop] 0/1418, RunningAvgSamplesPerSec=6.342979836865651, CurrSamplesPerSec=5.6932759499770516, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:35,103] [INFO] [logging.py:68:log_dist] [Rank 0] step=710, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:53:35,104] [INFO] [timer.py:197:stop] 0/1420, RunningAvgSamplesPerSec=6.3429412114598085, CurrSamplesPerSec=5.680045356492783, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:46,496] [INFO] [timer.py:197:stop] 0/1422, RunningAvgSamplesPerSec=6.3429387545618505, CurrSamplesPerSec=5.716258554728885, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:53:57,827] [INFO] [timer.py:197:stop] 0/1424, RunningAvgSamplesPerSec=6.342909086580767, CurrSamplesPerSec=5.685790895167144, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:54:09,182] [INFO] [timer.py:197:stop] 0/1426, RunningAvgSamplesPerSec=6.342904424723496, CurrSamplesPerSec=5.694281969307971, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:54:20,544] [INFO] [timer.py:197:stop] 0/1428, RunningAvgSamplesPerSec=6.34289700017322, CurrSamplesPerSec=5.698439521601493, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:54:31,873] [INFO] [timer.py:197:stop] 0/1430, RunningAvgSamplesPerSec=6.342893310403501, CurrSamplesPerSec=5.709524815881511, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:54:43,237] [INFO] [timer.py:197:stop] 0/1432, RunningAvgSamplesPerSec=6.342840839202115, CurrSamplesPerSec=5.677154847083064, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:54:54,580] [INFO] [timer.py:197:stop] 0/1434, RunningAvgSamplesPerSec=6.342816101843901, CurrSamplesPerSec=5.694050782866752, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:55:05,948] [INFO] [timer.py:197:stop] 0/1436, RunningAvgSamplesPerSec=6.342783522446399, CurrSamplesPerSec=5.674854826198604, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:55:17,329] [INFO] [timer.py:197:stop] 0/1438, RunningAvgSamplesPerSec=6.342740813957726, CurrSamplesPerSec=5.663693535908116, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:55:28,880] [INFO] [logging.py:68:log_dist] [Rank 0] step=720, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:55:28,882] [INFO] [timer.py:197:stop] 0/1440, RunningAvgSamplesPerSec=6.342599648821451, CurrSamplesPerSec=5.694897591728672, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:55:40,282] [INFO] [timer.py:197:stop] 0/1442, RunningAvgSamplesPerSec=6.342540221445991, CurrSamplesPerSec=5.6591770197937095, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:55:51,670] [INFO] [timer.py:197:stop] 0/1444, RunningAvgSamplesPerSec=6.342520759184326, CurrSamplesPerSec=5.697625762296395, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:56:03,038] [INFO] [timer.py:197:stop] 0/1446, RunningAvgSamplesPerSec=6.3424922000532105, CurrSamplesPerSec=5.678319730434744, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:56:14,426] [INFO] [timer.py:197:stop] 0/1448, RunningAvgSamplesPerSec=6.342433462125882, CurrSamplesPerSec=5.656962351987449, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:56:25,856] [INFO] [timer.py:197:stop] 0/1450, RunningAvgSamplesPerSec=6.34238288396144, CurrSamplesPerSec=5.660801018168518, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0236, 'learning_rate': 1e-05, 'epoch': 3.07} |
|
[2022-12-16 14:56:37,239] [INFO] [timer.py:197:stop] 0/1452, RunningAvgSamplesPerSec=6.342361993677473, CurrSamplesPerSec=5.68302784248084, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:56:48,968] [INFO] [timer.py:197:stop] 0/1454, RunningAvgSamplesPerSec=6.342314594444791, CurrSamplesPerSec=5.657922424784969, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:00,697] [INFO] [timer.py:197:stop] 0/1456, RunningAvgSamplesPerSec=6.342268591669828, CurrSamplesPerSec=5.6867578876713685, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:12,340] [INFO] [timer.py:197:stop] 0/1458, RunningAvgSamplesPerSec=6.34223068656754, CurrSamplesPerSec=5.667312325582682, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:24,102] [INFO] [logging.py:68:log_dist] [Rank 0] step=730, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:57:24,103] [INFO] [timer.py:197:stop] 0/1460, RunningAvgSamplesPerSec=6.342212515723276, CurrSamplesPerSec=5.696579150982275, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:35,745] [INFO] [timer.py:197:stop] 0/1462, RunningAvgSamplesPerSec=6.342151666537463, CurrSamplesPerSec=5.655240951176899, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:47,382] [INFO] [timer.py:197:stop] 0/1464, RunningAvgSamplesPerSec=6.342130155753065, CurrSamplesPerSec=5.695470085375787, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:57:58,732] [INFO] [timer.py:197:stop] 0/1466, RunningAvgSamplesPerSec=6.342123397647371, CurrSamplesPerSec=5.701415204243638, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:58:10,167] [INFO] [timer.py:197:stop] 0/1468, RunningAvgSamplesPerSec=6.342083483163338, CurrSamplesPerSec=5.672903119485949, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:58:21,529] [INFO] [timer.py:197:stop] 0/1470, RunningAvgSamplesPerSec=6.342080925878628, CurrSamplesPerSec=5.703203123854703, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:58:32,862] [INFO] [timer.py:197:stop] 0/1472, RunningAvgSamplesPerSec=6.342080072093458, CurrSamplesPerSec=5.70864427228525, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:58:44,209] [INFO] [timer.py:197:stop] 0/1474, RunningAvgSamplesPerSec=6.34205363377033, CurrSamplesPerSec=5.676758175313035, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:58:55,673] [INFO] [timer.py:197:stop] 0/1476, RunningAvgSamplesPerSec=6.342038899550425, CurrSamplesPerSec=5.692143546907543, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:59:07,023] [INFO] [timer.py:197:stop] 0/1478, RunningAvgSamplesPerSec=6.342022666973257, CurrSamplesPerSec=5.692293944653019, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:59:18,376] [INFO] [logging.py:68:log_dist] [Rank 0] step=740, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 14:59:18,376] [INFO] [timer.py:197:stop] 0/1480, RunningAvgSamplesPerSec=6.342011608288597, CurrSamplesPerSec=5.692075713606984, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:59:29,773] [INFO] [timer.py:197:stop] 0/1482, RunningAvgSamplesPerSec=6.341969359985125, CurrSamplesPerSec=5.6732767109147755, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:59:41,274] [INFO] [timer.py:197:stop] 0/1484, RunningAvgSamplesPerSec=6.341953698000013, CurrSamplesPerSec=5.6926032144284955, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 14:59:52,679] [INFO] [timer.py:197:stop] 0/1486, RunningAvgSamplesPerSec=6.341940489551061, CurrSamplesPerSec=5.70655085457492, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:00:04,027] [INFO] [timer.py:197:stop] 0/1488, RunningAvgSamplesPerSec=6.341926388483119, CurrSamplesPerSec=5.696006192633686, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:00:15,606] [INFO] [timer.py:197:stop] 0/1490, RunningAvgSamplesPerSec=6.341889248128019, CurrSamplesPerSec=5.661455750178363, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:00:27,090] [INFO] [timer.py:197:stop] 0/1492, RunningAvgSamplesPerSec=6.341863503136832, CurrSamplesPerSec=5.680092470863853, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:00:38,422] [INFO] [timer.py:197:stop] 0/1494, RunningAvgSamplesPerSec=6.34186409141219, CurrSamplesPerSec=5.709879441312319, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:00:49,791] [INFO] [timer.py:197:stop] 0/1496, RunningAvgSamplesPerSec=6.341834904526626, CurrSamplesPerSec=5.687857054469082, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:01:01,198] [INFO] [timer.py:197:stop] 0/1498, RunningAvgSamplesPerSec=6.341831456889497, CurrSamplesPerSec=5.6952240609171, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:01:12,713] [INFO] [logging.py:68:log_dist] [Rank 0] step=750, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:01:12,714] [INFO] [timer.py:197:stop] 0/1500, RunningAvgSamplesPerSec=6.341677946328467, CurrSamplesPerSec=5.672521186308156, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0188, 'learning_rate': 1e-05, 'epoch': 3.18} |
|
[2022-12-16 15:01:24,132] [INFO] [timer.py:197:stop] 0/1502, RunningAvgSamplesPerSec=6.341638775258454, CurrSamplesPerSec=5.674385305739847, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:01:35,509] [INFO] [timer.py:197:stop] 0/1504, RunningAvgSamplesPerSec=6.341605500941506, CurrSamplesPerSec=5.663864183791158, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:01:46,957] [INFO] [timer.py:197:stop] 0/1506, RunningAvgSamplesPerSec=6.341589086574736, CurrSamplesPerSec=5.685052983667712, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:01:58,324] [INFO] [timer.py:197:stop] 0/1508, RunningAvgSamplesPerSec=6.341588805379671, CurrSamplesPerSec=5.714745899200174, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:02:09,883] [INFO] [timer.py:197:stop] 0/1510, RunningAvgSamplesPerSec=6.341402469690666, CurrSamplesPerSec=5.49238308723314, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:02:21,434] [INFO] [timer.py:197:stop] 0/1512, RunningAvgSamplesPerSec=6.341350248424772, CurrSamplesPerSec=5.64702999565295, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:02:32,771] [INFO] [timer.py:197:stop] 0/1514, RunningAvgSamplesPerSec=6.341346331910384, CurrSamplesPerSec=5.697509184428209, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:02:44,097] [INFO] [timer.py:197:stop] 0/1516, RunningAvgSamplesPerSec=6.341353366978005, CurrSamplesPerSec=5.702388003556426, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:02:55,459] [INFO] [timer.py:197:stop] 0/1518, RunningAvgSamplesPerSec=6.341341164701243, CurrSamplesPerSec=5.680299927621715, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:03:06,830] [INFO] [logging.py:68:log_dist] [Rank 0] step=760, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:03:06,831] [INFO] [timer.py:197:stop] 0/1520, RunningAvgSamplesPerSec=6.341327370475824, CurrSamplesPerSec=5.696119324643216, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:03:18,237] [INFO] [timer.py:197:stop] 0/1522, RunningAvgSamplesPerSec=6.3412688547482485, CurrSamplesPerSec=5.640309421300005, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:03:29,836] [INFO] [timer.py:197:stop] 0/1524, RunningAvgSamplesPerSec=6.341192736338762, CurrSamplesPerSec=5.62095345330348, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:03:41,247] [INFO] [timer.py:197:stop] 0/1526, RunningAvgSamplesPerSec=6.3411631021869335, CurrSamplesPerSec=5.681983940905689, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:03:52,544] [INFO] [timer.py:197:stop] 0/1528, RunningAvgSamplesPerSec=6.341152967024458, CurrSamplesPerSec=5.692563859733904, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:04:03,975] [INFO] [timer.py:197:stop] 0/1530, RunningAvgSamplesPerSec=6.341126463250525, CurrSamplesPerSec=5.688522400154612, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:04:15,350] [INFO] [timer.py:197:stop] 0/1532, RunningAvgSamplesPerSec=6.3411357711857494, CurrSamplesPerSec=5.722231561425265, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:04:26,819] [INFO] [timer.py:197:stop] 0/1534, RunningAvgSamplesPerSec=6.341032355356612, CurrSamplesPerSec=5.592960434656473, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:04:38,309] [INFO] [timer.py:197:stop] 0/1536, RunningAvgSamplesPerSec=6.341019027499484, CurrSamplesPerSec=5.7054393583614305, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:04:49,822] [INFO] [timer.py:197:stop] 0/1538, RunningAvgSamplesPerSec=6.340996967931803, CurrSamplesPerSec=5.67102990975972, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:01,167] [INFO] [logging.py:68:log_dist] [Rank 0] step=770, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:05:01,168] [INFO] [timer.py:197:stop] 0/1540, RunningAvgSamplesPerSec=6.3409852431183245, CurrSamplesPerSec=5.704927420125413, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:12,589] [INFO] [timer.py:197:stop] 0/1542, RunningAvgSamplesPerSec=6.340993330961346, CurrSamplesPerSec=5.696307162711856, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:23,944] [INFO] [timer.py:197:stop] 0/1544, RunningAvgSamplesPerSec=6.340975620108428, CurrSamplesPerSec=5.693175005359446, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:35,280] [INFO] [timer.py:197:stop] 0/1546, RunningAvgSamplesPerSec=6.340972755518048, CurrSamplesPerSec=5.672557387435001, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:46,641] [INFO] [timer.py:197:stop] 0/1548, RunningAvgSamplesPerSec=6.340963188590646, CurrSamplesPerSec=5.6946083679150945, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:05:58,057] [INFO] [timer.py:197:stop] 0/1550, RunningAvgSamplesPerSec=6.3409774774642935, CurrSamplesPerSec=5.707322024491383, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.018, 'learning_rate': 1e-05, 'epoch': 3.28} |
|
[2022-12-16 15:06:09,366] [INFO] [timer.py:197:stop] 0/1552, RunningAvgSamplesPerSec=6.340983462641784, CurrSamplesPerSec=5.684134233724698, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:06:20,722] [INFO] [timer.py:197:stop] 0/1554, RunningAvgSamplesPerSec=6.340964370973567, CurrSamplesPerSec=5.6702874401091625, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:06:32,034] [INFO] [timer.py:197:stop] 0/1556, RunningAvgSamplesPerSec=6.34098163609854, CurrSamplesPerSec=5.711575451265504, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:06:43,575] [INFO] [timer.py:197:stop] 0/1558, RunningAvgSamplesPerSec=6.3409544491119885, CurrSamplesPerSec=5.653666341686975, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:06:54,908] [INFO] [logging.py:68:log_dist] [Rank 0] step=780, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:06:54,909] [INFO] [timer.py:197:stop] 0/1560, RunningAvgSamplesPerSec=6.340939628127967, CurrSamplesPerSec=5.696208769797546, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:07:06,270] [INFO] [timer.py:197:stop] 0/1562, RunningAvgSamplesPerSec=6.340902908469031, CurrSamplesPerSec=5.646739911729805, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:07:17,608] [INFO] [timer.py:197:stop] 0/1564, RunningAvgSamplesPerSec=6.340920513538756, CurrSamplesPerSec=5.727755537884269, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:07:28,943] [INFO] [timer.py:197:stop] 0/1566, RunningAvgSamplesPerSec=6.340918573203408, CurrSamplesPerSec=5.710421423215805, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:07:40,276] [INFO] [timer.py:197:stop] 0/1568, RunningAvgSamplesPerSec=6.340909584614333, CurrSamplesPerSec=5.693910437455586, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:07:51,731] [INFO] [timer.py:197:stop] 0/1570, RunningAvgSamplesPerSec=6.340812940881451, CurrSamplesPerSec=5.600275587292008, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:08:03,160] [INFO] [timer.py:197:stop] 0/1572, RunningAvgSamplesPerSec=6.340797815746698, CurrSamplesPerSec=5.6831344435491795, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:08:14,551] [INFO] [timer.py:197:stop] 0/1574, RunningAvgSamplesPerSec=6.3407527776718755, CurrSamplesPerSec=5.678339669712356, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:08:26,109] [INFO] [timer.py:197:stop] 0/1576, RunningAvgSamplesPerSec=6.340728823079258, CurrSamplesPerSec=5.681213350273135, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:08:37,451] [INFO] [timer.py:197:stop] 0/1578, RunningAvgSamplesPerSec=6.340723657680421, CurrSamplesPerSec=5.703325266587802, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:08:48,816] [INFO] [logging.py:68:log_dist] [Rank 0] step=790, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:08:48,818] [INFO] [timer.py:197:stop] 0/1580, RunningAvgSamplesPerSec=6.340702822409906, CurrSamplesPerSec=5.6887936454989125, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:00,327] [INFO] [timer.py:197:stop] 0/1582, RunningAvgSamplesPerSec=6.340607479790234, CurrSamplesPerSec=5.687409480116973, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:11,689] [INFO] [timer.py:197:stop] 0/1584, RunningAvgSamplesPerSec=6.340585472875107, CurrSamplesPerSec=5.703302970301742, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:23,029] [INFO] [timer.py:197:stop] 0/1586, RunningAvgSamplesPerSec=6.340578666142596, CurrSamplesPerSec=5.705081646702652, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:34,366] [INFO] [timer.py:197:stop] 0/1588, RunningAvgSamplesPerSec=6.340575615865826, CurrSamplesPerSec=5.719122257106722, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:45,710] [INFO] [timer.py:197:stop] 0/1590, RunningAvgSamplesPerSec=6.340566764043695, CurrSamplesPerSec=5.715390779107138, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:09:57,055] [INFO] [timer.py:197:stop] 0/1592, RunningAvgSamplesPerSec=6.340557130296451, CurrSamplesPerSec=5.699683348396006, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:10:08,390] [INFO] [timer.py:197:stop] 0/1594, RunningAvgSamplesPerSec=6.340557571688335, CurrSamplesPerSec=5.70996810455332, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:10:19,713] [INFO] [timer.py:197:stop] 0/1596, RunningAvgSamplesPerSec=6.3405689893037245, CurrSamplesPerSec=5.716324774516089, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:10:31,023] [INFO] [timer.py:197:stop] 0/1598, RunningAvgSamplesPerSec=6.340562882035166, CurrSamplesPerSec=5.696043419337321, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:10:42,347] [INFO] [logging.py:68:log_dist] [Rank 0] step=800, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:10:42,348] [INFO] [timer.py:197:stop] 0/1600, RunningAvgSamplesPerSec=6.3405693929765485, CurrSamplesPerSec=5.706464238375231, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0174, 'learning_rate': 1e-05, 'epoch': 3.39} |
|
[2022-12-16 15:10:53,716] [INFO] [timer.py:197:stop] 0/1602, RunningAvgSamplesPerSec=6.340545818042899, CurrSamplesPerSec=5.665583903559678, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:11:05,052] [INFO] [timer.py:197:stop] 0/1604, RunningAvgSamplesPerSec=6.340532394750886, CurrSamplesPerSec=5.684705046826125, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:11:16,380] [INFO] [timer.py:197:stop] 0/1606, RunningAvgSamplesPerSec=6.3405379936170565, CurrSamplesPerSec=5.723209769550559, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:11:27,689] [INFO] [timer.py:197:stop] 0/1608, RunningAvgSamplesPerSec=6.340558373519565, CurrSamplesPerSec=5.724374832199237, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:11:39,003] [INFO] [timer.py:197:stop] 0/1610, RunningAvgSamplesPerSec=6.340574932775503, CurrSamplesPerSec=5.714505505743788, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:11:50,368] [INFO] [timer.py:197:stop] 0/1612, RunningAvgSamplesPerSec=6.340552845058058, CurrSamplesPerSec=5.672900481980691, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:01,863] [INFO] [timer.py:197:stop] 0/1614, RunningAvgSamplesPerSec=6.340556605308443, CurrSamplesPerSec=5.707677104495976, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:13,214] [INFO] [timer.py:197:stop] 0/1616, RunningAvgSamplesPerSec=6.340529980463788, CurrSamplesPerSec=5.693129122625681, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:24,544] [INFO] [timer.py:197:stop] 0/1618, RunningAvgSamplesPerSec=6.340522357131446, CurrSamplesPerSec=5.6907620942189405, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:35,862] [INFO] [logging.py:68:log_dist] [Rank 0] step=810, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:12:35,863] [INFO] [timer.py:197:stop] 0/1620, RunningAvgSamplesPerSec=6.340536145377171, CurrSamplesPerSec=5.7158288937529, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:47,196] [INFO] [timer.py:197:stop] 0/1622, RunningAvgSamplesPerSec=6.3405388983082975, CurrSamplesPerSec=5.697895699489032, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:12:58,558] [INFO] [timer.py:197:stop] 0/1624, RunningAvgSamplesPerSec=6.3405193550861245, CurrSamplesPerSec=5.697626487899726, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:13:09,878] [INFO] [timer.py:197:stop] 0/1626, RunningAvgSamplesPerSec=6.340530865823653, CurrSamplesPerSec=5.716477426708531, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:13:21,183] [INFO] [timer.py:197:stop] 0/1628, RunningAvgSamplesPerSec=6.340541185528216, CurrSamplesPerSec=5.716781538069418, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:13:32,508] [INFO] [timer.py:197:stop] 0/1630, RunningAvgSamplesPerSec=6.340549925763438, CurrSamplesPerSec=5.704277626137134, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:13:43,844] [INFO] [timer.py:197:stop] 0/1632, RunningAvgSamplesPerSec=6.34055012552681, CurrSamplesPerSec=5.70801232308509, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:13:55,123] [INFO] [timer.py:197:stop] 0/1634, RunningAvgSamplesPerSec=6.3405811966256245, CurrSamplesPerSec=5.737563784561477, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:14:06,435] [INFO] [timer.py:197:stop] 0/1636, RunningAvgSamplesPerSec=6.340600660492391, CurrSamplesPerSec=5.7175949331930624, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:14:17,751] [INFO] [timer.py:197:stop] 0/1638, RunningAvgSamplesPerSec=6.340616583305436, CurrSamplesPerSec=5.731508568669221, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:14:29,026] [INFO] [logging.py:68:log_dist] [Rank 0] step=820, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:14:29,028] [INFO] [timer.py:197:stop] 0/1640, RunningAvgSamplesPerSec=6.340639941750262, CurrSamplesPerSec=5.7271948644686015, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:14:40,485] [INFO] [timer.py:197:stop] 0/1642, RunningAvgSamplesPerSec=6.340665258371482, CurrSamplesPerSec=5.7339821304526515, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:14:51,789] [INFO] [timer.py:197:stop] 0/1644, RunningAvgSamplesPerSec=6.340689499225285, CurrSamplesPerSec=5.729058170773215, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:15:03,134] [INFO] [timer.py:197:stop] 0/1646, RunningAvgSamplesPerSec=6.3406821277773915, CurrSamplesPerSec=5.714029644744564, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:15:14,464] [INFO] [timer.py:197:stop] 0/1648, RunningAvgSamplesPerSec=6.3406864146534705, CurrSamplesPerSec=5.720165223602495, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:15:25,781] [INFO] [timer.py:197:stop] 0/1650, RunningAvgSamplesPerSec=6.3407000988458115, CurrSamplesPerSec=5.725634404589424, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0192, 'learning_rate': 1e-05, 'epoch': 3.5} |
|
[2022-12-16 15:15:37,132] [INFO] [timer.py:197:stop] 0/1652, RunningAvgSamplesPerSec=6.340688070776378, CurrSamplesPerSec=5.681139525002643, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:15:48,450] [INFO] [timer.py:197:stop] 0/1654, RunningAvgSamplesPerSec=6.3407015706076875, CurrSamplesPerSec=5.712932255317997, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:15:59,771] [INFO] [timer.py:197:stop] 0/1656, RunningAvgSamplesPerSec=6.34071232163291, CurrSamplesPerSec=5.690258334978994, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:16:11,095] [INFO] [timer.py:197:stop] 0/1658, RunningAvgSamplesPerSec=6.340721311658182, CurrSamplesPerSec=5.715881715444265, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:16:22,381] [INFO] [logging.py:68:log_dist] [Rank 0] step=830, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:16:22,383] [INFO] [timer.py:197:stop] 0/1660, RunningAvgSamplesPerSec=6.340756882047056, CurrSamplesPerSec=5.73620530816962, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:16:33,702] [INFO] [timer.py:197:stop] 0/1662, RunningAvgSamplesPerSec=6.340768352793115, CurrSamplesPerSec=5.712284526465004, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:16:44,994] [INFO] [timer.py:197:stop] 0/1664, RunningAvgSamplesPerSec=6.340788107983439, CurrSamplesPerSec=5.718592754145091, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:16:56,373] [INFO] [timer.py:197:stop] 0/1666, RunningAvgSamplesPerSec=6.340797551500921, CurrSamplesPerSec=5.708101171107673, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:17:07,706] [INFO] [timer.py:197:stop] 0/1668, RunningAvgSamplesPerSec=6.340805651338497, CurrSamplesPerSec=5.721101024112321, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:17:19,005] [INFO] [timer.py:197:stop] 0/1670, RunningAvgSamplesPerSec=6.34083286597194, CurrSamplesPerSec=5.724742536840911, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:17:30,358] [INFO] [timer.py:197:stop] 0/1672, RunningAvgSamplesPerSec=6.340835867117041, CurrSamplesPerSec=5.718272858087763, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:17:41,674] [INFO] [timer.py:197:stop] 0/1674, RunningAvgSamplesPerSec=6.34085070644047, CurrSamplesPerSec=5.728282094031116, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:17:52,982] [INFO] [timer.py:197:stop] 0/1676, RunningAvgSamplesPerSec=6.340870906975045, CurrSamplesPerSec=5.727989958307704, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:18:04,315] [INFO] [timer.py:197:stop] 0/1678, RunningAvgSamplesPerSec=6.340874137995883, CurrSamplesPerSec=5.6954224738759125, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:18:15,671] [INFO] [logging.py:68:log_dist] [Rank 0] step=840, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:18:15,673] [INFO] [timer.py:197:stop] 0/1680, RunningAvgSamplesPerSec=6.340857783353727, CurrSamplesPerSec=5.6949985974417725, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:18:27,057] [INFO] [timer.py:197:stop] 0/1682, RunningAvgSamplesPerSec=6.340822690011218, CurrSamplesPerSec=5.6497331758688745, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:18:38,349] [INFO] [timer.py:197:stop] 0/1684, RunningAvgSamplesPerSec=6.340832385934641, CurrSamplesPerSec=5.7044698818484925, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:18:49,705] [INFO] [timer.py:197:stop] 0/1686, RunningAvgSamplesPerSec=6.340817948806496, CurrSamplesPerSec=5.689992498002407, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:01,049] [INFO] [timer.py:197:stop] 0/1688, RunningAvgSamplesPerSec=6.340812410463375, CurrSamplesPerSec=5.711610451189968, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:12,368] [INFO] [timer.py:197:stop] 0/1690, RunningAvgSamplesPerSec=6.340822290105834, CurrSamplesPerSec=5.712652381474077, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:23,713] [INFO] [timer.py:197:stop] 0/1692, RunningAvgSamplesPerSec=6.340802834576483, CurrSamplesPerSec=5.702006208078319, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:35,035] [INFO] [timer.py:197:stop] 0/1694, RunningAvgSamplesPerSec=6.340800674912201, CurrSamplesPerSec=5.70051779093254, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:46,368] [INFO] [timer.py:197:stop] 0/1696, RunningAvgSamplesPerSec=6.340802979386393, CurrSamplesPerSec=5.71140458973723, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:19:57,706] [INFO] [timer.py:197:stop] 0/1698, RunningAvgSamplesPerSec=6.340799458449714, CurrSamplesPerSec=5.697038084295858, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:20:09,092] [INFO] [logging.py:68:log_dist] [Rank 0] step=850, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:20:09,094] [INFO] [timer.py:197:stop] 0/1700, RunningAvgSamplesPerSec=6.340776519732791, CurrSamplesPerSec=5.693090968073443, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0172, 'learning_rate': 1e-05, 'epoch': 3.6} |
|
[2022-12-16 15:20:20,415] [INFO] [timer.py:197:stop] 0/1702, RunningAvgSamplesPerSec=6.340774932475345, CurrSamplesPerSec=5.696837624348786, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:20:31,736] [INFO] [timer.py:197:stop] 0/1704, RunningAvgSamplesPerSec=6.340782842982672, CurrSamplesPerSec=5.710392511677529, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:20:43,066] [INFO] [timer.py:197:stop] 0/1706, RunningAvgSamplesPerSec=6.340784748235628, CurrSamplesPerSec=5.723442353636006, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:20:54,378] [INFO] [timer.py:197:stop] 0/1708, RunningAvgSamplesPerSec=6.340788744151501, CurrSamplesPerSec=5.710971284076315, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:21:05,705] [INFO] [timer.py:197:stop] 0/1710, RunningAvgSamplesPerSec=6.340796318366405, CurrSamplesPerSec=5.712790734447396, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:21:17,043] [INFO] [timer.py:197:stop] 0/1712, RunningAvgSamplesPerSec=6.340794253176509, CurrSamplesPerSec=5.705561839441021, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:21:28,393] [INFO] [timer.py:197:stop] 0/1714, RunningAvgSamplesPerSec=6.340780816552236, CurrSamplesPerSec=5.709888186057529, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:21:39,736] [INFO] [timer.py:197:stop] 0/1716, RunningAvgSamplesPerSec=6.340773996797511, CurrSamplesPerSec=5.692864949062375, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:21:51,084] [INFO] [timer.py:197:stop] 0/1718, RunningAvgSamplesPerSec=6.3407637629341504, CurrSamplesPerSec=5.68697932560317, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:02,431] [INFO] [logging.py:68:log_dist] [Rank 0] step=860, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:22:02,432] [INFO] [timer.py:197:stop] 0/1720, RunningAvgSamplesPerSec=6.340755202209772, CurrSamplesPerSec=5.697098055754913, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:13,758] [INFO] [timer.py:197:stop] 0/1722, RunningAvgSamplesPerSec=6.340751182900368, CurrSamplesPerSec=5.707358185779483, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:25,078] [INFO] [timer.py:197:stop] 0/1724, RunningAvgSamplesPerSec=6.3407617876526725, CurrSamplesPerSec=5.698471699382909, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:36,432] [INFO] [timer.py:197:stop] 0/1726, RunningAvgSamplesPerSec=6.340748989367053, CurrSamplesPerSec=5.697477984936619, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:47,756] [INFO] [timer.py:197:stop] 0/1728, RunningAvgSamplesPerSec=6.340759082484556, CurrSamplesPerSec=5.723073351794558, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:22:59,083] [INFO] [timer.py:197:stop] 0/1730, RunningAvgSamplesPerSec=6.340763571016226, CurrSamplesPerSec=5.715076594904665, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:23:10,419] [INFO] [timer.py:197:stop] 0/1732, RunningAvgSamplesPerSec=6.340760591870521, CurrSamplesPerSec=5.720671366374499, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:23:21,765] [INFO] [timer.py:197:stop] 0/1734, RunningAvgSamplesPerSec=6.3407506244159615, CurrSamplesPerSec=5.702110615341308, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:23:33,078] [INFO] [timer.py:197:stop] 0/1736, RunningAvgSamplesPerSec=6.340764084573826, CurrSamplesPerSec=5.722994042135287, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:23:44,427] [INFO] [timer.py:197:stop] 0/1738, RunningAvgSamplesPerSec=6.340759095569804, CurrSamplesPerSec=5.703108612234952, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:23:55,739] [INFO] [logging.py:68:log_dist] [Rank 0] step=870, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:23:55,740] [INFO] [timer.py:197:stop] 0/1740, RunningAvgSamplesPerSec=6.340772651542185, CurrSamplesPerSec=5.708757664068845, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:24:07,134] [INFO] [timer.py:197:stop] 0/1742, RunningAvgSamplesPerSec=6.340728237008029, CurrSamplesPerSec=5.6599414049939645, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:24:18,462] [INFO] [timer.py:197:stop] 0/1744, RunningAvgSamplesPerSec=6.340720477023084, CurrSamplesPerSec=5.686679099443914, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:24:29,834] [INFO] [timer.py:197:stop] 0/1746, RunningAvgSamplesPerSec=6.340691354009204, CurrSamplesPerSec=5.671267617798262, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:24:41,199] [INFO] [timer.py:197:stop] 0/1748, RunningAvgSamplesPerSec=6.34066862116125, CurrSamplesPerSec=5.671456935833448, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:24:52,552] [INFO] [timer.py:197:stop] 0/1750, RunningAvgSamplesPerSec=6.340654665427781, CurrSamplesPerSec=5.697200106865696, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0187, 'learning_rate': 1e-05, 'epoch': 3.71} |
|
[2022-12-16 15:25:03,919] [INFO] [timer.py:197:stop] 0/1752, RunningAvgSamplesPerSec=6.340650056369047, CurrSamplesPerSec=5.699635424390096, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:25:15,329] [INFO] [timer.py:197:stop] 0/1754, RunningAvgSamplesPerSec=6.340586232602359, CurrSamplesPerSec=5.617304973847408, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:25:26,838] [INFO] [timer.py:197:stop] 0/1756, RunningAvgSamplesPerSec=6.34054369343057, CurrSamplesPerSec=5.63907597079815, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:25:38,202] [INFO] [timer.py:197:stop] 0/1758, RunningAvgSamplesPerSec=6.340520750611476, CurrSamplesPerSec=5.692025262129367, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:25:49,546] [INFO] [logging.py:68:log_dist] [Rank 0] step=880, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:25:49,548] [INFO] [timer.py:197:stop] 0/1760, RunningAvgSamplesPerSec=6.340509283801745, CurrSamplesPerSec=5.699267065755165, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:00,858] [INFO] [timer.py:197:stop] 0/1762, RunningAvgSamplesPerSec=6.340512797895522, CurrSamplesPerSec=5.690896252043385, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:12,227] [INFO] [timer.py:197:stop] 0/1764, RunningAvgSamplesPerSec=6.340485387877073, CurrSamplesPerSec=5.685304873174933, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:23,560] [INFO] [timer.py:197:stop] 0/1766, RunningAvgSamplesPerSec=6.340484955568013, CurrSamplesPerSec=5.709429366012479, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:34,873] [INFO] [timer.py:197:stop] 0/1768, RunningAvgSamplesPerSec=6.3404757888004575, CurrSamplesPerSec=5.709597923410203, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:46,218] [INFO] [timer.py:197:stop] 0/1770, RunningAvgSamplesPerSec=6.340500276138529, CurrSamplesPerSec=5.719488556056539, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:26:57,528] [INFO] [timer.py:197:stop] 0/1772, RunningAvgSamplesPerSec=6.340494914187493, CurrSamplesPerSec=5.700560403199792, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:27:08,873] [INFO] [timer.py:197:stop] 0/1774, RunningAvgSamplesPerSec=6.340475694361281, CurrSamplesPerSec=5.6971311856910365, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:27:20,208] [INFO] [timer.py:197:stop] 0/1776, RunningAvgSamplesPerSec=6.340464658491628, CurrSamplesPerSec=5.688927469397443, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:27:31,527] [INFO] [timer.py:197:stop] 0/1778, RunningAvgSamplesPerSec=6.340466106938646, CurrSamplesPerSec=5.696105062026932, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:27:42,858] [INFO] [logging.py:68:log_dist] [Rank 0] step=890, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:27:42,859] [INFO] [timer.py:197:stop] 0/1780, RunningAvgSamplesPerSec=6.340468950864724, CurrSamplesPerSec=5.7109836772115425, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:27:54,179] [INFO] [timer.py:197:stop] 0/1782, RunningAvgSamplesPerSec=6.34048081034785, CurrSamplesPerSec=5.718655129518526, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:28:05,546] [INFO] [timer.py:197:stop] 0/1784, RunningAvgSamplesPerSec=6.3404639178617, CurrSamplesPerSec=5.689838603736941, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:28:16,891] [INFO] [timer.py:197:stop] 0/1786, RunningAvgSamplesPerSec=6.340447917585286, CurrSamplesPerSec=5.701282729620904, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:28:28,169] [INFO] [timer.py:197:stop] 0/1788, RunningAvgSamplesPerSec=6.340477045430172, CurrSamplesPerSec=5.719102274082581, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:28:39,479] [INFO] [timer.py:197:stop] 0/1790, RunningAvgSamplesPerSec=6.340496164823028, CurrSamplesPerSec=5.7324916652807145, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:28:50,816] [INFO] [timer.py:197:stop] 0/1792, RunningAvgSamplesPerSec=6.340487880983273, CurrSamplesPerSec=5.698580332321764, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:29:02,190] [INFO] [timer.py:197:stop] 0/1794, RunningAvgSamplesPerSec=6.340450777396085, CurrSamplesPerSec=5.665844833683194, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:29:13,517] [INFO] [timer.py:197:stop] 0/1796, RunningAvgSamplesPerSec=6.340458415597113, CurrSamplesPerSec=5.732360435673601, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:29:24,826] [INFO] [timer.py:197:stop] 0/1798, RunningAvgSamplesPerSec=6.340466973403078, CurrSamplesPerSec=5.70002295421367, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:29:36,125] [INFO] [logging.py:68:log_dist] [Rank 0] step=900, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:29:36,127] [INFO] [timer.py:197:stop] 0/1800, RunningAvgSamplesPerSec=6.340469889682115, CurrSamplesPerSec=5.709452681716447, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0204, 'learning_rate': 1e-05, 'epoch': 3.81} |
|
[2022-12-16 15:29:47,449] [INFO] [timer.py:197:stop] 0/1802, RunningAvgSamplesPerSec=6.340470448543859, CurrSamplesPerSec=5.705118749623552, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:29:58,796] [INFO] [timer.py:197:stop] 0/1804, RunningAvgSamplesPerSec=6.340465008023673, CurrSamplesPerSec=5.704326113114681, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:30:10,126] [INFO] [timer.py:197:stop] 0/1806, RunningAvgSamplesPerSec=6.340469993364399, CurrSamplesPerSec=5.716778859605554, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:30:21,486] [INFO] [timer.py:197:stop] 0/1808, RunningAvgSamplesPerSec=6.340455654310352, CurrSamplesPerSec=5.696311030807412, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:30:32,902] [INFO] [timer.py:197:stop] 0/1810, RunningAvgSamplesPerSec=6.340440008637558, CurrSamplesPerSec=5.69354861493453, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:30:44,244] [INFO] [timer.py:197:stop] 0/1812, RunningAvgSamplesPerSec=6.3404269337809795, CurrSamplesPerSec=5.687951302311593, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:30:55,551] [INFO] [timer.py:197:stop] 0/1814, RunningAvgSamplesPerSec=6.340436811922203, CurrSamplesPerSec=5.7140992186554405, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:31:06,943] [INFO] [timer.py:197:stop] 0/1816, RunningAvgSamplesPerSec=6.3403996757853776, CurrSamplesPerSec=5.67528506823798, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:31:18,308] [INFO] [timer.py:197:stop] 0/1818, RunningAvgSamplesPerSec=6.340382725721073, CurrSamplesPerSec=5.690074996474909, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:31:29,646] [INFO] [logging.py:68:log_dist] [Rank 0] step=910, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:31:29,648] [INFO] [timer.py:197:stop] 0/1820, RunningAvgSamplesPerSec=6.340380453070602, CurrSamplesPerSec=5.697219453460077, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:31:40,982] [INFO] [timer.py:197:stop] 0/1822, RunningAvgSamplesPerSec=6.340372173208823, CurrSamplesPerSec=5.696034958680131, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:31:52,366] [INFO] [timer.py:197:stop] 0/1824, RunningAvgSamplesPerSec=6.340341872616558, CurrSamplesPerSec=5.65803381050704, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:32:03,713] [INFO] [timer.py:197:stop] 0/1826, RunningAvgSamplesPerSec=6.340335362702489, CurrSamplesPerSec=5.68163012630602, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:32:15,088] [INFO] [timer.py:197:stop] 0/1828, RunningAvgSamplesPerSec=6.340311460005998, CurrSamplesPerSec=5.672971455704383, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:32:26,454] [INFO] [timer.py:197:stop] 0/1830, RunningAvgSamplesPerSec=6.340292625240316, CurrSamplesPerSec=5.683508180460664, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:32:37,789] [INFO] [timer.py:197:stop] 0/1832, RunningAvgSamplesPerSec=6.340283810735743, CurrSamplesPerSec=5.711948562823965, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:32:49,111] [INFO] [timer.py:197:stop] 0/1834, RunningAvgSamplesPerSec=6.340296033180755, CurrSamplesPerSec=5.712358677165236, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:00,434] [INFO] [timer.py:197:stop] 0/1836, RunningAvgSamplesPerSec=6.340306904575268, CurrSamplesPerSec=5.722238392350369, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:11,791] [INFO] [timer.py:197:stop] 0/1838, RunningAvgSamplesPerSec=6.340302653827454, CurrSamplesPerSec=5.7009608927935735, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:23,121] [INFO] [logging.py:68:log_dist] [Rank 0] step=920, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:33:23,123] [INFO] [timer.py:197:stop] 0/1840, RunningAvgSamplesPerSec=6.340307186096981, CurrSamplesPerSec=5.721817345090641, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:34,450] [INFO] [timer.py:197:stop] 0/1842, RunningAvgSamplesPerSec=6.340310574451815, CurrSamplesPerSec=5.707220095809952, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:45,752] [INFO] [timer.py:197:stop] 0/1844, RunningAvgSamplesPerSec=6.340314647155094, CurrSamplesPerSec=5.702791899977914, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:33:57,050] [INFO] [timer.py:197:stop] 0/1846, RunningAvgSamplesPerSec=6.340344920432395, CurrSamplesPerSec=5.73337958161255, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:34:08,381] [INFO] [timer.py:197:stop] 0/1848, RunningAvgSamplesPerSec=6.340351678136677, CurrSamplesPerSec=5.699651398969188, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:34:19,739] [INFO] [timer.py:197:stop] 0/1850, RunningAvgSamplesPerSec=6.340329804278119, CurrSamplesPerSec=5.6642150719709745, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0219, 'learning_rate': 1e-05, 'epoch': 3.92} |
|
[2022-12-16 15:34:31,332] [INFO] [timer.py:197:stop] 0/1852, RunningAvgSamplesPerSec=6.340320173759496, CurrSamplesPerSec=5.685434679637573, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:34:42,641] [INFO] [timer.py:197:stop] 0/1854, RunningAvgSamplesPerSec=6.340341633522805, CurrSamplesPerSec=5.720974461013235, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:34:53,975] [INFO] [timer.py:197:stop] 0/1856, RunningAvgSamplesPerSec=6.340345032511093, CurrSamplesPerSec=5.698232915031294, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:35:05,293] [INFO] [timer.py:197:stop] 0/1858, RunningAvgSamplesPerSec=6.34036156019452, CurrSamplesPerSec=5.730119198060043, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:35:16,627] [INFO] [logging.py:68:log_dist] [Rank 0] step=930, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:35:16,628] [INFO] [timer.py:197:stop] 0/1860, RunningAvgSamplesPerSec=6.340354586223302, CurrSamplesPerSec=5.699637844775112, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:35:28,009] [INFO] [timer.py:197:stop] 0/1862, RunningAvgSamplesPerSec=6.340331703176067, CurrSamplesPerSec=5.7007633044378485, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:35:39,364] [INFO] [timer.py:197:stop] 0/1864, RunningAvgSamplesPerSec=6.340320629171417, CurrSamplesPerSec=5.700264551624048, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:35:50,747] [INFO] [timer.py:197:stop] 0/1866, RunningAvgSamplesPerSec=6.340291769112233, CurrSamplesPerSec=5.670955150866777, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:02,205] [INFO] [timer.py:197:stop] 0/1868, RunningAvgSamplesPerSec=6.3402612229421065, CurrSamplesPerSec=5.660783350621326, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:13,563] [INFO] [timer.py:197:stop] 0/1870, RunningAvgSamplesPerSec=6.3402478378844656, CurrSamplesPerSec=5.695013579455546, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:24,928] [INFO] [timer.py:197:stop] 0/1872, RunningAvgSamplesPerSec=6.3402215069873495, CurrSamplesPerSec=5.676158230558184, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:36,261] [INFO] [timer.py:197:stop] 0/1874, RunningAvgSamplesPerSec=6.340214184418767, CurrSamplesPerSec=5.703797891921867, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:47,636] [INFO] [timer.py:197:stop] 0/1876, RunningAvgSamplesPerSec=6.34017937906379, CurrSamplesPerSec=5.6780811906590385, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:36:58,980] [INFO] [timer.py:197:stop] 0/1878, RunningAvgSamplesPerSec=6.340165382597096, CurrSamplesPerSec=5.690802871737469, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:37:10,344] [INFO] [logging.py:68:log_dist] [Rank 0] step=940, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:37:10,346] [INFO] [timer.py:197:stop] 0/1880, RunningAvgSamplesPerSec=6.340154199169903, CurrSamplesPerSec=5.695819099556473, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:37:21,776] [INFO] [timer.py:197:stop] 0/1882, RunningAvgSamplesPerSec=6.340082275520113, CurrSamplesPerSec=5.59697386324921, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:37:33,153] [INFO] [timer.py:197:stop] 0/1884, RunningAvgSamplesPerSec=6.340077121816984, CurrSamplesPerSec=5.682787222717036, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:37:44,559] [INFO] [timer.py:197:stop] 0/1886, RunningAvgSamplesPerSec=6.340016406206763, CurrSamplesPerSec=5.617691028794858, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:37:53,121] [INFO] [timer.py:197:stop] 0/1888, RunningAvgSamplesPerSec=6.341654344181224, CurrSamplesPerSec=10.16626217811499, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:38:04,512] [INFO] [timer.py:197:stop] 0/1890, RunningAvgSamplesPerSec=6.341630658368521, CurrSamplesPerSec=5.689458727983866, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:38:15,890] [INFO] [timer.py:197:stop] 0/1892, RunningAvgSamplesPerSec=6.341630694542799, CurrSamplesPerSec=5.717106624821156, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:38:27,291] [INFO] [timer.py:197:stop] 0/1894, RunningAvgSamplesPerSec=6.341593704282419, CurrSamplesPerSec=5.663913659259904, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:38:38,668] [INFO] [timer.py:197:stop] 0/1896, RunningAvgSamplesPerSec=6.3415848072494505, CurrSamplesPerSec=5.700920696013412, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:38:50,040] [INFO] [timer.py:197:stop] 0/1898, RunningAvgSamplesPerSec=6.341580824069318, CurrSamplesPerSec=5.683621538588577, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:39:01,468] [INFO] [logging.py:68:log_dist] [Rank 0] step=950, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:39:01,470] [INFO] [timer.py:197:stop] 0/1900, RunningAvgSamplesPerSec=6.34155899470158, CurrSamplesPerSec=5.697557556408557, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0187, 'learning_rate': 1e-05, 'epoch': 4.03} |
|
[2022-12-16 15:39:12,952] [INFO] [timer.py:197:stop] 0/1902, RunningAvgSamplesPerSec=6.34151319605015, CurrSamplesPerSec=5.6264130146308915, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:39:24,327] [INFO] [timer.py:197:stop] 0/1904, RunningAvgSamplesPerSec=6.341484615721599, CurrSamplesPerSec=5.678025462199816, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:39:35,656] [INFO] [timer.py:197:stop] 0/1906, RunningAvgSamplesPerSec=6.341489394394688, CurrSamplesPerSec=5.706058123533021, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:39:46,994] [INFO] [timer.py:197:stop] 0/1908, RunningAvgSamplesPerSec=6.341491713254405, CurrSamplesPerSec=5.715185861867555, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:39:58,386] [INFO] [timer.py:197:stop] 0/1910, RunningAvgSamplesPerSec=6.341472882637041, CurrSamplesPerSec=5.677095294757984, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:40:09,784] [INFO] [timer.py:197:stop] 0/1912, RunningAvgSamplesPerSec=6.3414513606525675, CurrSamplesPerSec=5.6910752998571565, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:40:21,180] [INFO] [timer.py:197:stop] 0/1914, RunningAvgSamplesPerSec=6.341433961090274, CurrSamplesPerSec=5.682636123920992, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:40:32,536] [INFO] [timer.py:197:stop] 0/1916, RunningAvgSamplesPerSec=6.341419848976018, CurrSamplesPerSec=5.69269279063943, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:40:43,871] [INFO] [timer.py:197:stop] 0/1918, RunningAvgSamplesPerSec=6.341421151163785, CurrSamplesPerSec=5.706983975017767, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:40:55,185] [INFO] [logging.py:68:log_dist] [Rank 0] step=960, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:40:55,187] [INFO] [timer.py:197:stop] 0/1920, RunningAvgSamplesPerSec=6.341423927935272, CurrSamplesPerSec=5.699510051257391, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:41:06,555] [INFO] [timer.py:197:stop] 0/1922, RunningAvgSamplesPerSec=6.341407820716116, CurrSamplesPerSec=5.692106129762454, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:41:17,882] [INFO] [timer.py:197:stop] 0/1924, RunningAvgSamplesPerSec=6.34140485881814, CurrSamplesPerSec=5.680903393209991, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:41:29,236] [INFO] [timer.py:197:stop] 0/1926, RunningAvgSamplesPerSec=6.341392016956714, CurrSamplesPerSec=5.700544181409319, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:41:40,560] [INFO] [timer.py:197:stop] 0/1928, RunningAvgSamplesPerSec=6.341396073096842, CurrSamplesPerSec=5.723259067011961, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:41:51,832] [INFO] [timer.py:197:stop] 0/1930, RunningAvgSamplesPerSec=6.341426372801827, CurrSamplesPerSec=5.72995089457395, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:03,168] [INFO] [timer.py:197:stop] 0/1932, RunningAvgSamplesPerSec=6.341441799114161, CurrSamplesPerSec=5.716127823844544, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:14,459] [INFO] [timer.py:197:stop] 0/1934, RunningAvgSamplesPerSec=6.341471385915023, CurrSamplesPerSec=5.733927748894998, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:25,807] [INFO] [timer.py:197:stop] 0/1936, RunningAvgSamplesPerSec=6.341461312162032, CurrSamplesPerSec=5.716151924619451, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:37,153] [INFO] [timer.py:197:stop] 0/1938, RunningAvgSamplesPerSec=6.341441825644964, CurrSamplesPerSec=5.688323262262418, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:48,497] [INFO] [logging.py:68:log_dist] [Rank 0] step=970, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:42:48,499] [INFO] [timer.py:197:stop] 0/1940, RunningAvgSamplesPerSec=6.341433751250798, CurrSamplesPerSec=5.708959206622437, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:42:59,850] [INFO] [timer.py:197:stop] 0/1942, RunningAvgSamplesPerSec=6.341421168411483, CurrSamplesPerSec=5.700994309825445, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:43:11,167] [INFO] [timer.py:197:stop] 0/1944, RunningAvgSamplesPerSec=6.341437433207248, CurrSamplesPerSec=5.729389057586628, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:43:22,482] [INFO] [timer.py:197:stop] 0/1946, RunningAvgSamplesPerSec=6.341449750940114, CurrSamplesPerSec=5.731650284155366, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:43:33,841] [INFO] [timer.py:197:stop] 0/1948, RunningAvgSamplesPerSec=6.34143681656222, CurrSamplesPerSec=5.707239753200862, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:43:45,211] [INFO] [timer.py:197:stop] 0/1950, RunningAvgSamplesPerSec=6.341424387903559, CurrSamplesPerSec=5.698836567618656, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.0111, 'learning_rate': 1e-05, 'epoch': 4.13} |
|
[2022-12-16 15:43:56,545] [INFO] [timer.py:197:stop] 0/1952, RunningAvgSamplesPerSec=6.341417935741718, CurrSamplesPerSec=5.69639999845513, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:44:07,909] [INFO] [timer.py:197:stop] 0/1954, RunningAvgSamplesPerSec=6.34139789710389, CurrSamplesPerSec=5.689112180831886, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:44:19,241] [INFO] [timer.py:197:stop] 0/1956, RunningAvgSamplesPerSec=6.3414133504784935, CurrSamplesPerSec=5.720193015245668, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:44:30,531] [INFO] [timer.py:197:stop] 0/1958, RunningAvgSamplesPerSec=6.341444321124853, CurrSamplesPerSec=5.728681842652188, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:44:41,837] [INFO] [logging.py:68:log_dist] [Rank 0] step=980, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:44:41,839] [INFO] [timer.py:197:stop] 0/1960, RunningAvgSamplesPerSec=6.341451451493984, CurrSamplesPerSec=5.710415106384128, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:44:53,144] [INFO] [timer.py:197:stop] 0/1962, RunningAvgSamplesPerSec=6.341471134508416, CurrSamplesPerSec=5.711562326404418, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:45:04,445] [INFO] [timer.py:197:stop] 0/1964, RunningAvgSamplesPerSec=6.341493080264879, CurrSamplesPerSec=5.729100721703047, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:45:15,779] [INFO] [timer.py:197:stop] 0/1966, RunningAvgSamplesPerSec=6.34148308742656, CurrSamplesPerSec=5.688469600735318, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:45:27,069] [INFO] [timer.py:197:stop] 0/1968, RunningAvgSamplesPerSec=6.341511108059361, CurrSamplesPerSec=5.731986611130398, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:45:38,375] [INFO] [timer.py:197:stop] 0/1970, RunningAvgSamplesPerSec=6.3415193795566775, CurrSamplesPerSec=5.733448892776753, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:45:49,691] [INFO] [timer.py:197:stop] 0/1972, RunningAvgSamplesPerSec=6.341533700701917, CurrSamplesPerSec=5.739973842519514, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:00,986] [INFO] [timer.py:197:stop] 0/1974, RunningAvgSamplesPerSec=6.3415605832917, CurrSamplesPerSec=5.722849338175083, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:12,325] [INFO] [timer.py:197:stop] 0/1976, RunningAvgSamplesPerSec=6.341561196651521, CurrSamplesPerSec=5.719616515849108, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:23,642] [INFO] [timer.py:197:stop] 0/1978, RunningAvgSamplesPerSec=6.341577111524756, CurrSamplesPerSec=5.718702155602035, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:35,049] [INFO] [logging.py:68:log_dist] [Rank 0] step=990, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:46:35,051] [INFO] [timer.py:197:stop] 0/1980, RunningAvgSamplesPerSec=6.341536046420819, CurrSamplesPerSec=5.6339966705043025, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:46,373] [INFO] [timer.py:197:stop] 0/1982, RunningAvgSamplesPerSec=6.341540306208283, CurrSamplesPerSec=5.709136721549093, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:46:57,772] [INFO] [timer.py:197:stop] 0/1984, RunningAvgSamplesPerSec=6.341494762804846, CurrSamplesPerSec=5.629541542374531, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:47:09,086] [INFO] [timer.py:197:stop] 0/1986, RunningAvgSamplesPerSec=6.3414999295135335, CurrSamplesPerSec=5.7012602071255065, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:47:20,422] [INFO] [timer.py:197:stop] 0/1988, RunningAvgSamplesPerSec=6.341500411199719, CurrSamplesPerSec=5.715253517003507, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:47:31,766] [INFO] [timer.py:197:stop] 0/1990, RunningAvgSamplesPerSec=6.341498268714996, CurrSamplesPerSec=5.716839977905756, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:47:43,079] [INFO] [timer.py:197:stop] 0/1992, RunningAvgSamplesPerSec=6.341516519800052, CurrSamplesPerSec=5.747599131276496, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:47:54,350] [INFO] [timer.py:197:stop] 0/1994, RunningAvgSamplesPerSec=6.341548967038678, CurrSamplesPerSec=5.74344624333017, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:48:05,695] [INFO] [timer.py:197:stop] 0/1996, RunningAvgSamplesPerSec=6.341542401618161, CurrSamplesPerSec=5.699316435634214, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:48:16,977] [INFO] [timer.py:197:stop] 0/1998, RunningAvgSamplesPerSec=6.341574552805016, CurrSamplesPerSec=5.7254710049596, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
[2022-12-16 15:48:28,278] [INFO] [logging.py:68:log_dist] [Rank 0] step=1000, skipped=5, lr=[1e-05], mom=[[0.9, 0.999]] |
|
[2022-12-16 15:48:28,280] [INFO] [timer.py:197:stop] 0/2000, RunningAvgSamplesPerSec=6.341592336361306, CurrSamplesPerSec=5.702190558521644, MemAllocated=3.0GB, MaxMemAllocated=19.53GB |
|
{'loss': 0.012, 'learning_rate': 1e-05, 'epoch': 4.24} |
|
{'eval_loss': 0.167724609375, 'eval_wer': 10.242905287195448, 'eval_runtime': 2152.7201, 'eval_samples_per_second': 3.583, 'eval_steps_per_second': 0.448, 'epoch': 4.24} |
|
[2022-12-16 16:24:24,591] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! |
|
[2022-12-16 16:24:24,601] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt |
|
[2022-12-16 16:24:24,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt... |
|
[2022-12-16 16:24:28,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt. |
|
[2022-12-16 16:24:28,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[2022-12-16 16:24:43,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2022-12-16 16:24:43,493] [INFO] [engine.py:3269:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2022-12-16 16:24:43,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! |
|
|