diff --git "a/train.log" "b/train.log" new file mode 100644--- /dev/null +++ "b/train.log" @@ -0,0 +1,8615 @@ +[2024-11-01 00:11:54,199] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 00:11:56,053] [WARNING] [runner.py:212:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +Detected VISIBLE_DEVICES=0,1,2,3 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed. +[2024-11-01 00:11:56,054] [INFO] [runner.py:585:main] cmd = /home/chanho/.conda/envs/COMEDY/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29504 --enable_each_rank_log=None /home/chanho/Model/Financial_Contest/main.py --model_name_or_path Qwen/Qwen2-7B-Instruct --train_data_path /home/chanho/Model/Financial_Contest/dataset/train.csv --valid_data_path /home/chanho/Model/Financial_Contest/dataset/valid.csv --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --data_output_path /home/chanho/Model/Financial_Contest/output/data --max_seq_len 1536 --learning_rate 1e-5 --weight_decay 0.1 --num_train_epochs 3 --num_train_samples 810675 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 1000 --seed 42 --save_interval 200000 --eval_interval 10000 --output_dir /home/chanho/Model/Financial_Contest/output/2024-11-01-00.11.52 --offload +[2024-11-01 00:11:57,817] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 00:11:59,914] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]} +[2024-11-01 00:11:59,914] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0 +[2024-11-01 00:11:59,914] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1]}) +[2024-11-01 00:11:59,914] [INFO] [launch.py:164:main] dist_world_size=2 +[2024-11-01 00:11:59,914] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1 +[2024-11-01 00:11:59,915] [INFO] [launch.py:256:main] process 2252702 spawned with command: ['/home/chanho/.conda/envs/COMEDY/bin/python', '-u', '/home/chanho/Model/Financial_Contest/main.py', '--local_rank=0', '--model_name_or_path', 'Qwen/Qwen2-7B-Instruct', '--train_data_path', '/home/chanho/Model/Financial_Contest/dataset/train.csv', '--valid_data_path', '/home/chanho/Model/Financial_Contest/dataset/valid.csv', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '2', '--data_output_path', '/home/chanho/Model/Financial_Contest/output/data', '--max_seq_len', '1536', '--learning_rate', '1e-5', '--weight_decay', '0.1', '--num_train_epochs', '3', '--num_train_samples', '810675', '--gradient_accumulation_steps', '1', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '1000', '--seed', '42', '--save_interval', '200000', '--eval_interval', '10000', '--output_dir', '/home/chanho/Model/Financial_Contest/output/2024-11-01-00.11.52', '--offload'] +[2024-11-01 00:11:59,916] [INFO] [launch.py:256:main] process 2252703 spawned with command: ['/home/chanho/.conda/envs/COMEDY/bin/python', '-u', '/home/chanho/Model/Financial_Contest/main.py', '--local_rank=1', '--model_name_or_path', 'Qwen/Qwen2-7B-Instruct', '--train_data_path', '/home/chanho/Model/Financial_Contest/dataset/train.csv', '--valid_data_path', '/home/chanho/Model/Financial_Contest/dataset/valid.csv', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '2', '--data_output_path', '/home/chanho/Model/Financial_Contest/output/data', '--max_seq_len', '1536', '--learning_rate', '1e-5', '--weight_decay', '0.1', '--num_train_epochs', '3', '--num_train_samples', '810675', '--gradient_accumulation_steps', '1', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '1000', '--seed', '42', '--save_interval', '200000', '--eval_interval', '10000', '--output_dir', '/home/chanho/Model/Financial_Contest/output/2024-11-01-00.11.52', '--offload'] +[2024-11-01 00:12:02,642] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 00:12:02,671] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 00:12:05,343] [INFO] [comm.py:652:init_distributed] cdb=None +[2024-11-01 00:12:05,344] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2024-11-01 00:12:05,464] [INFO] [comm.py:652:init_distributed] cdb=None + Loading checkpoint shards: 0%| | 0/4 [00:00 +[2024-11-01 00:13:55,856] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer +[2024-11-01 00:13:55,856] [INFO] [stage_1_and_2.py:148:__init__] Reduce bucket size 200000000 +[2024-11-01 00:13:55,856] [INFO] [stage_1_and_2.py:149:__init__] Allgather bucket size 200000000 +[2024-11-01 00:13:55,856] [INFO] [stage_1_and_2.py:150:__init__] CPU Offload: True +[2024-11-01 00:13:55,856] [INFO] [stage_1_and_2.py:151:__init__] Round robin gradient partitioning: False +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +[2024-11-01 00:13:56,751] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +[2024-11-01 00:13:56,752] [INFO] [utils.py:782:see_memory_usage] MA 14.68 GB Max_MA 14.68 GB CA 14.81 GB Max_CA 15 GB +[2024-11-01 00:13:56,752] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 39.34 GB, percent = 7.8% +[2024-11-01 00:13:56,939] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +[2024-11-01 00:13:56,940] [INFO] [utils.py:782:see_memory_usage] MA 14.68 GB Max_MA 14.68 GB CA 14.81 GB Max_CA 15 GB +[2024-11-01 00:13:56,940] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 39.36 GB, percent = 7.8% +[2024-11-01 00:13:56,940] [INFO] [stage_1_and_2.py:543:__init__] optimizer state initialized +[2024-11-01 00:13:57,153] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +[2024-11-01 00:13:57,154] [INFO] [utils.py:782:see_memory_usage] MA 14.68 GB Max_MA 14.68 GB CA 14.81 GB Max_CA 15 GB +[2024-11-01 00:13:57,154] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 39.36 GB, percent = 7.8% +[2024-11-01 00:13:57,157] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +[2024-11-01 00:13:57,157] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2024-11-01 00:13:57,157] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2024-11-01 00:13:57,157] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.95)] +[2024-11-01 00:13:57,160] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] amp_enabled .................. False +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] amp_params ................... False +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] bfloat16_enabled ............. True +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] comms_config ................. +[2024-11-01 00:13:57,160] [INFO] [config.py:1003:print] communication_data_type ...... None +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] disable_allgather ............ False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] dump_state ................... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] fp16_auto_cast ............... None +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] fp16_enabled ................. False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] global_rank .................. 0 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 1 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] loss_scale ................... 1.0 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] optimizer_name ............... DeepSpeedCPUAdam +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 1e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.1} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] pld_enabled .................. False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] pld_params ................... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] scheduler_name ............... cosine +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 1000} +[2024-11-01 00:13:57,161] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] sparse_attention ............. None +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] train_batch_size ............. 4 +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] world_size ................... 2 +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] zero_enabled ................. True +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2024-11-01 00:13:57,162] [INFO] [config.py:1003:print] zero_optimization_stage ...... 2 +[2024-11-01 00:13:57,162] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 4, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 1, + "steps_per_print": 10, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2.000000e+08, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2.000000e+08, + "contiguous_gradients": true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + } + }, + "bf16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "optimizer": { + "type": "DeepSpeedCPUAdam", + "params": { + "lr": 1e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.1 + } + }, + "scheduler": { + "type": "cosine", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-05, + "warmup_num_steps": 1000 + } + } +} +wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +wandb: Currently logged in as: cksgh0984 (organization-chanho). Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.18.0 +wandb: Run data is saved locally in /home/chanho/Model/Financial_Contest/wandb/run-20241101_001357-agjc19ic +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run devilish-veil-470 +wandb: ⭐️ View project at https://wandb.ai/organization-chanho/Shared%20Memory +wandb: 🚀 View run at https://wandb.ai/organization-chanho/Shared%20Memory/runs/agjc19ic +wandb: Currently logged in as: cksgh0984 (organization-chanho). Use `wandb login --relogin` to force relogin +Beginning of Epoch 1/3 +wandb: Tracking run with wandb version 0.18.0 +wandb: Run data is saved locally in /home/chanho/Model/Financial_Contest/wandb/run-20241101_001357-aws5932s +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run sinister-spell-471 +wandb: ⭐️ View project at https://wandb.ai/organization-chanho/Shared%20Memory +wandb: 🚀 View run at https://wandb.ai/organization-chanho/Shared%20Memory/runs/aws5932s +Beginning of Epoch 1/3 +[2024-11-01 00:14:14,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[1.0000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:14:14,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=2.6452098453373503, CurrSamplesPerSec=2.6613573688746888, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:14:29,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[2.0000000000000002e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:14:29,393] [INFO] [timer.py:259:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=2.646115277631168, CurrSamplesPerSec=2.6487326261097013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:14:44,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[3.0000000000000004e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:14:44,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=2.642082077134501, CurrSamplesPerSec=2.636933552647975, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:14:59,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[4.0000000000000003e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:14:59,846] [INFO] [timer.py:259:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=2.6435629368810543, CurrSamplesPerSec=2.656193303523146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:15:15,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5.000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:15:15,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=2.640846664027087, CurrSamplesPerSec=2.634442954155628, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:15:30,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[6.000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:15:30,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=2.6391570408786347, CurrSamplesPerSec=2.665096278231578, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:15:45,710] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[7.000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:15:45,713] [INFO] [timer.py:259:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=2.6387394228616268, CurrSamplesPerSec=2.641437666565027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:16:00,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[8.000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:16:00,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=2.6380359518148144, CurrSamplesPerSec=2.6439590143973417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:16:16,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.000000000000001e-07], mom=[(0.9, 0.95)] +[2024-11-01 00:16:16,307] [INFO] [timer.py:259:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=2.6364248358344993, CurrSamplesPerSec=2.556075870363237, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:16:31,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[1.0000000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:16:31,543] [INFO] [timer.py:259:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=2.6367554587127917, CurrSamplesPerSec=2.635687043931292, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:16:46,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:16:46,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=2.6361732488685226, CurrSamplesPerSec=2.635841084853061, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:17:02,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[1.2000000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:17:02,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=2.635087502692657, CurrSamplesPerSec=2.634164581307341, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:17:17,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[1.3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:17:17,407] [INFO] [timer.py:259:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=2.63522397132344, CurrSamplesPerSec=2.6496738504789112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:17:32,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[1.4000000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:17:32,714] [INFO] [timer.py:259:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=2.634676154197519, CurrSamplesPerSec=2.638530589517406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:17:48,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[1.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:17:48,006] [INFO] [timer.py:259:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=2.6343166270012683, CurrSamplesPerSec=2.6025091070860604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:18:03,255] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[1.6000000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:18:03,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=2.634240520257191, CurrSamplesPerSec=2.6544765389856053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:18:18,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[1.7000000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:18:18,580] [INFO] [timer.py:259:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=2.6338008053966577, CurrSamplesPerSec=2.586307759343132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:18:33,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[1.8000000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:18:33,880] [INFO] [timer.py:259:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=2.633420730507304, CurrSamplesPerSec=2.6366057586704517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:18:49,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[1.9000000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:18:49,194] [INFO] [timer.py:259:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=2.633004511638365, CurrSamplesPerSec=2.6174997654170777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:19:04,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[2.0000000000000003e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:19:04,478] [INFO] [timer.py:259:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=2.6329849178691442, CurrSamplesPerSec=2.649806930810858, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:19:19,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[2.1000000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:19:19,715] [INFO] [timer.py:259:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=2.6332650752123583, CurrSamplesPerSec=2.6534114544913505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:19:34,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[2.2e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:19:34,988] [INFO] [timer.py:259:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=2.633102094480507, CurrSamplesPerSec=2.6249770769168417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:19:50,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[2.3000000000000004e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:19:50,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=2.632762770769148, CurrSamplesPerSec=2.62847766915859, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:20:05,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[2.4000000000000003e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:20:05,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=2.6324177853119455, CurrSamplesPerSec=2.6342270342864578, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:20:20,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[2.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:20:20,913] [INFO] [timer.py:259:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=2.632406320793437, CurrSamplesPerSec=2.6313197033116618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:20:36,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[2.6e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:20:36,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=2.631807627763473, CurrSamplesPerSec=2.6216385914565614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:20:51,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[2.7000000000000004e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:20:51,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=2.6313469999770347, CurrSamplesPerSec=2.6320743228778762, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:21:06,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[2.8000000000000003e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:21:06,938] [INFO] [timer.py:259:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=2.631131806246773, CurrSamplesPerSec=2.6498136270298387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:21:22,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[2.9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:21:22,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=2.630976888695761, CurrSamplesPerSec=2.6320383984611175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:21:37,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:21:37,524] [INFO] [timer.py:259:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=2.63094417685999, CurrSamplesPerSec=2.620236249422549, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:21:52,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[3.1000000000000004e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:21:52,806] [INFO] [timer.py:259:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=2.630864565137125, CurrSamplesPerSec=2.646227212656092, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:22:08,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[3.2000000000000003e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:22:08,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=2.6308713039168015, CurrSamplesPerSec=2.580608644593738, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:22:23,365] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[3.3000000000000006e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:22:23,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=2.630922779655412, CurrSamplesPerSec=2.6304636434660456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:22:38,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[3.4000000000000005e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:22:38,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=2.6306472703104493, CurrSamplesPerSec=2.568516836594075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:22:53,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[3.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:22:53,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=2.63106126843767, CurrSamplesPerSec=2.642612614620263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:23:09,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[3.6000000000000003e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:23:09,194] [INFO] [timer.py:259:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=2.6310282526323805, CurrSamplesPerSec=2.619056987854137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:23:24,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[3.7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:23:24,505] [INFO] [timer.py:259:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=2.6308977523555, CurrSamplesPerSec=2.613496117533262, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:23:39,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[3.8000000000000005e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:23:39,924] [INFO] [timer.py:259:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=2.6302774233632396, CurrSamplesPerSec=2.636275560680403, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:23:55,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[3.900000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:23:55,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=2.6301296509519556, CurrSamplesPerSec=2.6491186563590188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:24:10,575] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.000000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:24:10,582] [INFO] [timer.py:259:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=2.629887947176043, CurrSamplesPerSec=2.6377544310739682, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:24:25,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.1e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:24:25,857] [INFO] [timer.py:259:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=2.6299249878617554, CurrSamplesPerSec=2.6282289641499887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:24:41,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[4.2000000000000004e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:24:41,196] [INFO] [timer.py:259:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=2.629743000661228, CurrSamplesPerSec=2.637016446462378, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:24:56,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[4.3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:24:56,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=2.6296892166059154, CurrSamplesPerSec=2.621419030980815, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:25:11,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[4.4e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:25:11,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=2.6296278489312925, CurrSamplesPerSec=2.6340132174992656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:25:27,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[4.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:25:27,089] [INFO] [timer.py:259:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=2.629587575150157, CurrSamplesPerSec=2.620627525904718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:25:42,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[4.600000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:25:42,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=2.6293332247924295, CurrSamplesPerSec=2.63373741616587, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:25:57,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[4.7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:25:57,744] [INFO] [timer.py:259:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=2.6292996369243244, CurrSamplesPerSec=2.5896183671809707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:26:13,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[4.800000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:26:13,098] [INFO] [timer.py:259:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=2.6290503367258182, CurrSamplesPerSec=2.632048721468972, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:26:28,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[4.9000000000000005e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:26:28,360] [INFO] [timer.py:259:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=2.62909784852631, CurrSamplesPerSec=2.6226725764866092, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:26:43,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:26:43,673] [INFO] [timer.py:259:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=2.629035062218647, CurrSamplesPerSec=2.622812388916804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:26:58,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[5.1e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:26:58,998] [INFO] [timer.py:259:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=2.628944987008803, CurrSamplesPerSec=2.5899089933956123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:27:14,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[5.2e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:27:14,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=2.6289134938680045, CurrSamplesPerSec=2.6306451228721643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:27:29,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[5.300000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:27:29,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=2.6288599751728525, CurrSamplesPerSec=2.603215379547533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:27:44,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[5.400000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:27:44,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=2.628932542496205, CurrSamplesPerSec=2.6584122023720007, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:28:00,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[5.500000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:28:00,247] [INFO] [timer.py:259:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=2.62889932268037, CurrSamplesPerSec=2.6282701372248316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:28:15,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[5.600000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:28:15,647] [INFO] [timer.py:259:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=2.628586639955244, CurrSamplesPerSec=2.6388692388834367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:28:30,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[5.7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:28:30,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=2.6285315455681997, CurrSamplesPerSec=2.6210328412622848, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:28:46,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[5.8e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:28:46,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=2.628437668886466, CurrSamplesPerSec=2.6342646729424026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:29:01,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[5.9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:29:01,613] [INFO] [timer.py:259:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=2.6283749153899, CurrSamplesPerSec=2.6305960384715092, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:29:16,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:29:16,896] [INFO] [timer.py:259:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=2.62841166070599, CurrSamplesPerSec=2.627132176786439, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:29:32,187] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[6.1e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:29:32,191] [INFO] [timer.py:259:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=2.628437622095167, CurrSamplesPerSec=2.6362656187363447, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:29:47,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[6.200000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:29:47,521] [INFO] [timer.py:259:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=2.6283601755619013, CurrSamplesPerSec=2.601330820862448, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:30:02,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[6.300000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:30:02,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=2.6283217299725776, CurrSamplesPerSec=2.6444120094798786, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:30:18,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[6.4000000000000006e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:30:18,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=2.628320402370172, CurrSamplesPerSec=2.609961798141172, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:30:33,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[6.5000000000000004e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:30:33,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=2.6282975459339952, CurrSamplesPerSec=2.6229645184931716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:30:48,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[6.600000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:30:48,783] [INFO] [timer.py:259:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=2.628281308523501, CurrSamplesPerSec=2.600559057167384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:31:04,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[6.700000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:31:04,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=2.6283556228082943, CurrSamplesPerSec=2.6433195845915893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:31:19,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[6.800000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:31:19,370] [INFO] [timer.py:259:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=2.628348000212683, CurrSamplesPerSec=2.6531085003369763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:31:34,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[6.9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:31:34,662] [INFO] [timer.py:259:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=2.628359120165543, CurrSamplesPerSec=2.634911729631738, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:31:49,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:31:49,955] [INFO] [timer.py:259:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=2.6283399383399924, CurrSamplesPerSec=2.636448313929522, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:32:05,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[7.100000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:32:05,174] [INFO] [timer.py:259:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=2.6285196274537825, CurrSamplesPerSec=2.627367919100581, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:32:20,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[7.2000000000000005e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:32:20,451] [INFO] [timer.py:259:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=2.6285549282281155, CurrSamplesPerSec=2.641579486908556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:32:35,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[7.3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:32:35,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=2.628605204755505, CurrSamplesPerSec=2.650787450078778, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:32:51,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[7.4e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:32:51,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=2.6286658904232225, CurrSamplesPerSec=2.6139961581418683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:33:06,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[7.500000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:33:06,275] [INFO] [timer.py:259:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=2.6287158496955327, CurrSamplesPerSec=2.631774157027502, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:33:21,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[7.600000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:33:21,555] [INFO] [timer.py:259:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=2.628766822678207, CurrSamplesPerSec=2.630513547806574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:33:36,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[7.7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:33:36,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=2.6287902455672025, CurrSamplesPerSec=2.6341285998096575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:33:52,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[7.800000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:33:52,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=2.6287058925990148, CurrSamplesPerSec=2.631228088605528, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:34:07,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[7.9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:34:07,574] [INFO] [timer.py:259:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=2.628670570384573, CurrSamplesPerSec=2.6025745090136256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:34:22,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[8.000000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:34:22,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=2.628638133745241, CurrSamplesPerSec=2.597379260698291, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:34:38,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[8.1e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:34:38,219] [INFO] [timer.py:259:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=2.6286540505845033, CurrSamplesPerSec=2.625025541129311, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:34:53,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[8.2e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:34:53,503] [INFO] [timer.py:259:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=2.628726804893921, CurrSamplesPerSec=2.6376466098091953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:35:08,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[8.3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:35:08,834] [INFO] [timer.py:259:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=2.6287310279811447, CurrSamplesPerSec=2.6340442332559584, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:35:24,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[8.400000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:35:24,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=2.6288142764611164, CurrSamplesPerSec=2.65683392883421, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:35:39,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[8.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:35:39,339] [INFO] [timer.py:259:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=2.629011048027878, CurrSamplesPerSec=2.63571023173607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:35:54,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[8.6e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:35:54,621] [INFO] [timer.py:259:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=2.6290713968105925, CurrSamplesPerSec=2.655840944122863, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:36:09,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[8.700000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:36:09,932] [INFO] [timer.py:259:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=2.629060361652311, CurrSamplesPerSec=2.63139110117687, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:36:25,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[8.8e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:36:25,227] [INFO] [timer.py:259:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=2.6291078962063286, CurrSamplesPerSec=2.656680790022824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:36:40,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[8.900000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:36:40,544] [INFO] [timer.py:259:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=2.6291383754881172, CurrSamplesPerSec=2.6369157311585583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:36:55,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:36:55,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=2.629251734180274, CurrSamplesPerSec=2.632648833834211, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:37:11,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[9.100000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:37:11,084] [INFO] [timer.py:259:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=2.629365893003863, CurrSamplesPerSec=2.6650776506896046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:37:26,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[9.200000000000002e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:37:26,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=2.6295005173963912, CurrSamplesPerSec=2.6463958459823873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:37:41,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[9.3e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:37:41,576] [INFO] [timer.py:259:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=2.6295609579544204, CurrSamplesPerSec=2.6348264853186403, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:37:56,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[9.4e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:37:56,830] [INFO] [timer.py:259:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=2.6296326750912353, CurrSamplesPerSec=2.6535268638124316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:38:12,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[9.5e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:38:12,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=2.629580148539359, CurrSamplesPerSec=2.6520453516056044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:38:27,532] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[9.600000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:38:27,535] [INFO] [timer.py:259:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=2.6295661777977632, CurrSamplesPerSec=2.6083691385221717, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:38:42,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[9.7e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:38:42,929] [INFO] [timer.py:259:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=2.6294538744139238, CurrSamplesPerSec=2.6229013682248223, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:38:58,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[9.800000000000001e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:38:58,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=2.629348536565772, CurrSamplesPerSec=2.577115943575459, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:39:13,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[9.9e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:39:13,545] [INFO] [timer.py:259:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=2.6294624803800675, CurrSamplesPerSec=2.655079360027185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:39:28,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[1e-05], mom=[(0.9, 0.95)] +[2024-11-01 00:39:28,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=2.6295229940316904, CurrSamplesPerSec=2.6115950235964616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:39:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=0, lr=[9.999999716124193e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:39:44,066] [INFO] [timer.py:259:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=2.6296496384224883, CurrSamplesPerSec=2.654117916860476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:39:59,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=0, lr=[9.999998864496805e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:39:59,344] [INFO] [timer.py:259:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=2.629667866365327, CurrSamplesPerSec=2.636952617763807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:40:14,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=0, lr=[9.999997445117932e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:40:14,537] [INFO] [timer.py:259:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=2.629805331937082, CurrSamplesPerSec=2.6512959980345205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:40:29,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=0, lr=[9.999995457987737e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:40:29,785] [INFO] [timer.py:259:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=2.6298865521478882, CurrSamplesPerSec=2.660028621956679, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:40:44,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=0, lr=[9.999992903106444e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:40:44,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=2.6300748440818675, CurrSamplesPerSec=2.6453468283127046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:41:00,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=0, lr=[9.999989780474342e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:41:00,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=2.6300946665263263, CurrSamplesPerSec=2.630037677697805, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:41:15,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=0, lr=[9.999986090091787e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:41:15,485] [INFO] [timer.py:259:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=2.630163311105559, CurrSamplesPerSec=2.6184140158079328, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:41:30,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=0, lr=[9.999981831959199e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:41:30,796] [INFO] [timer.py:259:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=2.6301459102921387, CurrSamplesPerSec=2.6328293753934164, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:41:46,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=0, lr=[9.999977006077058e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:41:46,003] [INFO] [timer.py:259:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=2.630265580178872, CurrSamplesPerSec=2.6482752234722113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:42:01,275] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=0, lr=[9.999971612445915e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:42:01,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=2.6302757513473383, CurrSamplesPerSec=2.634174093821591, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:42:16,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=0, lr=[9.999965651066383e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:42:16,527] [INFO] [timer.py:259:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=2.630329658885431, CurrSamplesPerSec=2.622766466474682, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:42:31,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=0, lr=[9.999959121939138e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:42:31,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=2.6303481944070004, CurrSamplesPerSec=2.6245298941868818, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:42:46,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=0, lr=[9.99995202506492e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:42:47,000] [INFO] [timer.py:259:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=2.630504109674064, CurrSamplesPerSec=2.651171565885162, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:43:02,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=0, lr=[9.999944360444536e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:43:02,202] [INFO] [timer.py:259:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=2.6306250767493835, CurrSamplesPerSec=2.670366881626615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:43:17,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=0, lr=[9.999936128078858e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:43:17,533] [INFO] [timer.py:259:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=2.63053159365646, CurrSamplesPerSec=2.5868712377535834, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:43:32,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=0, lr=[9.999927327968819e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:43:32,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=2.630563802385863, CurrSamplesPerSec=2.6487669167678662, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:43:48,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=0, lr=[9.999917960115416e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:43:48,062] [INFO] [timer.py:259:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=2.6305620132516756, CurrSamplesPerSec=2.6174332028646554, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:44:03,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=0, lr=[9.999908024519717e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:44:03,346] [INFO] [timer.py:259:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=2.6305440210938333, CurrSamplesPerSec=2.6359636679130207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:44:18,601] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=0, lr=[9.999897521182848e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:44:18,605] [INFO] [timer.py:259:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=2.630576472451352, CurrSamplesPerSec=2.6302924983340255, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:44:33,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=0, lr=[9.999886450106e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:44:33,862] [INFO] [timer.py:259:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=2.6306281550345454, CurrSamplesPerSec=2.6285913314871947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:44:49,135] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=0, lr=[9.999874811290435e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:44:49,137] [INFO] [timer.py:259:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=2.6306560330978064, CurrSamplesPerSec=2.6443144794886013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:45:04,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=0, lr=[9.99986260473747e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:45:04,416] [INFO] [timer.py:259:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=2.6306801506377293, CurrSamplesPerSec=2.6412023033601306, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:45:19,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=0, lr=[9.999849830448494e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:45:19,691] [INFO] [timer.py:259:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=2.6307180372744647, CurrSamplesPerSec=2.648632268309642, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:45:34,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=0, lr=[9.999836488424954e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:45:34,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=2.630767513187602, CurrSamplesPerSec=2.635060713519031, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:45:50,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=0, lr=[9.999822578668368e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:45:50,176] [INFO] [timer.py:259:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=2.630848552372437, CurrSamplesPerSec=2.659457698134224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:46:05,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=0, lr=[9.999808101180316e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:46:05,432] [INFO] [timer.py:259:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=2.6308948371396648, CurrSamplesPerSec=2.6470200597387716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:46:20,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=0, lr=[9.99979305596244e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:46:20,693] [INFO] [timer.py:259:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=2.630920221720672, CurrSamplesPerSec=2.6284653151499424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:46:35,927] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=0, lr=[9.99977744301645e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:46:35,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=2.6309757227300614, CurrSamplesPerSec=2.6351376951287104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:46:51,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=0, lr=[9.999761262344117e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:46:51,268] [INFO] [timer.py:259:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=2.6309477735904916, CurrSamplesPerSec=2.6383509248432473, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:47:06,549] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=0, lr=[9.999744513947279e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:47:06,550] [INFO] [timer.py:259:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=2.6309699357736362, CurrSamplesPerSec=2.636833258100296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:47:21,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=0, lr=[9.999727197827837e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:47:21,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=2.630994830223936, CurrSamplesPerSec=2.6338875077789696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:47:37,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=0, lr=[9.999709313987758e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:47:37,107] [INFO] [timer.py:259:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=2.6310373099537543, CurrSamplesPerSec=2.6151178731466893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:47:52,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=0, lr=[9.999690862429075e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:47:52,482] [INFO] [timer.py:259:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=2.6309452633050645, CurrSamplesPerSec=2.6352958112500073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:48:07,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=0, lr=[9.99967184315388e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:48:07,716] [INFO] [timer.py:259:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=2.6310196037653086, CurrSamplesPerSec=2.6211262043800216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:48:22,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=0, lr=[9.999652256164333e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:48:22,961] [INFO] [timer.py:259:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=2.631061967939248, CurrSamplesPerSec=2.6273230713180746, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:48:38,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=0, lr=[9.999632101462659e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:48:38,242] [INFO] [timer.py:259:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=2.6310447436907616, CurrSamplesPerSec=2.5946124394203474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:48:53,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=0, lr=[9.999611379051149e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:48:53,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=2.6311254102876322, CurrSamplesPerSec=2.6550159144012135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:49:08,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=0, lr=[9.99959008893215e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:49:08,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=2.6311637258249148, CurrSamplesPerSec=2.603158831331505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:49:23,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=0, lr=[9.999568231108085e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:49:24,000] [INFO] [timer.py:259:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=2.6311889052967357, CurrSamplesPerSec=2.6554832141853866, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:49:39,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=0, lr=[9.999545805581433e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:49:39,283] [INFO] [timer.py:259:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=2.6312094216139466, CurrSamplesPerSec=2.633322789138307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:49:54,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=0, lr=[9.999522812354742e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:49:54,513] [INFO] [timer.py:259:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=2.631274838220606, CurrSamplesPerSec=2.638168795566205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:50:09,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=0, lr=[9.999499251430623e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:50:09,845] [INFO] [timer.py:259:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=2.6312192653919797, CurrSamplesPerSec=2.633367428648098, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:50:25,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=0, lr=[9.999475122811749e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:50:25,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=2.631296328686617, CurrSamplesPerSec=2.669354413595818, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:50:40,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=0, lr=[9.99945042650086e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:50:40,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=2.6313074719245657, CurrSamplesPerSec=2.625508227334958, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:50:55,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=0, lr=[9.999425162500765e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:50:55,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=2.631377362593193, CurrSamplesPerSec=2.653202064665411, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:51:10,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=0, lr=[9.999399330814328e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:51:10,836] [INFO] [timer.py:259:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=2.6314047702010805, CurrSamplesPerSec=2.6559489966727825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:51:26,081] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=0, lr=[9.999372931444484e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:51:26,090] [INFO] [timer.py:259:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=2.6314249207435907, CurrSamplesPerSec=2.637915764735341, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:51:41,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=0, lr=[9.99934596439423e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:51:41,418] [INFO] [timer.py:259:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=2.631376788897242, CurrSamplesPerSec=2.63525234812151, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:51:56,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=0, lr=[9.99931842966663e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:51:56,712] [INFO] [timer.py:259:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=2.6313564357747423, CurrSamplesPerSec=2.6398906933850763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:52:11,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=0, lr=[9.999290327264805e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:52:11,983] [INFO] [timer.py:259:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=2.631365729623261, CurrSamplesPerSec=2.6444912057909753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:52:27,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=0, lr=[9.999261657191953e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:52:27,268] [INFO] [timer.py:259:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=2.6313586601613075, CurrSamplesPerSec=2.634634912646085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:52:42,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=0, lr=[9.999232419451325e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:52:42,587] [INFO] [timer.py:259:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=2.6313237280156465, CurrSamplesPerSec=2.631718012557898, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:52:57,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=0, lr=[9.999202614046243e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:52:57,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=2.6313230331928814, CurrSamplesPerSec=2.613018244523487, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:53:13,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=0, lr=[9.999172240980093e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:53:13,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=2.6312936560903273, CurrSamplesPerSec=2.6190300036594443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:53:28,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=0, lr=[9.999141300256318e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:53:28,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=2.631327555645516, CurrSamplesPerSec=2.6347044217562186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:53:43,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=0, lr=[9.99910979187844e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:53:43,721] [INFO] [timer.py:259:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=2.631308942532298, CurrSamplesPerSec=2.616629819024615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:53:59,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=0, lr=[9.999077715850027e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:53:59,012] [INFO] [timer.py:259:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=2.63128753960257, CurrSamplesPerSec=2.5973020568123033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:54:14,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=0, lr=[9.999045072174727e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:54:14,322] [INFO] [timer.py:259:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=2.6312705292990866, CurrSamplesPerSec=2.6324802956427966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:54:29,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=0, lr=[9.999011860856248e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:54:29,591] [INFO] [timer.py:259:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=2.6312792785924617, CurrSamplesPerSec=2.6302050785713127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:54:44,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=0, lr=[9.998978081898355e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:54:44,809] [INFO] [timer.py:259:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=2.6313350522054466, CurrSamplesPerSec=2.636642636604024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:55:00,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=0, lr=[9.998943735304891e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:55:00,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=2.6313330279911393, CurrSamplesPerSec=2.6342923856073504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:55:15,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=0, lr=[9.998908821079753e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:55:15,303] [INFO] [timer.py:259:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=2.6313693494537698, CurrSamplesPerSec=2.63316243005047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:55:30,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=0, lr=[9.998873339226905e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:55:30,619] [INFO] [timer.py:259:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=2.631315676853281, CurrSamplesPerSec=2.642605954746461, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:55:45,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=0, lr=[9.998837289750374e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:55:45,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=2.631284772561632, CurrSamplesPerSec=2.6373916061290488, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:56:01,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=0, lr=[9.998800672654256e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:56:01,214] [INFO] [timer.py:259:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=2.6312721027671238, CurrSamplesPerSec=2.6334397644002068, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:56:16,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=0, lr=[9.99876348794271e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:56:16,547] [INFO] [timer.py:259:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=2.6312287183436425, CurrSamplesPerSec=2.6358294897484544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:56:31,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=0, lr=[9.998725735619956e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:56:31,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=2.63118130177013, CurrSamplesPerSec=2.635736732583979, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:56:47,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=0, lr=[9.99868741569028e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:56:47,246] [INFO] [timer.py:259:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=2.631086007887863, CurrSamplesPerSec=2.6314922203897853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:57:02,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=0, lr=[9.998648528158037e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:57:02,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=2.631071688523522, CurrSamplesPerSec=2.635635286910206, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:57:17,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=0, lr=[9.99860907302764e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:57:17,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=2.631043680586781, CurrSamplesPerSec=2.6060580703166067, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:57:33,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=0, lr=[9.998569050303569e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:57:33,097] [INFO] [timer.py:259:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=2.6310604509245983, CurrSamplesPerSec=2.6552436604755885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:57:48,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=0, lr=[9.99852845999037e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:57:48,433] [INFO] [timer.py:259:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=2.6310025826630454, CurrSamplesPerSec=2.6095079456936547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:58:03,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=0, lr=[9.99848730209265e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:58:03,741] [INFO] [timer.py:259:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=2.630981742317034, CurrSamplesPerSec=2.6389356509137767, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:58:19,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=0, lr=[9.998445576615086e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:58:19,079] [INFO] [timer.py:259:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=2.630923623676534, CurrSamplesPerSec=2.590684849038156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:58:34,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=0, lr=[9.998403283562413e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:58:34,377] [INFO] [timer.py:259:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=2.630908627262128, CurrSamplesPerSec=2.641346593494753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:58:49,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=0, lr=[9.998360422939432e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:58:49,694] [INFO] [timer.py:259:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=2.6308739925736324, CurrSamplesPerSec=2.623763180646413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:59:05,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=0, lr=[9.998316994751013e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:59:05,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=2.630819574407263, CurrSamplesPerSec=2.627100500851658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:59:20,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=0, lr=[9.998272999002087e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:59:20,291] [INFO] [timer.py:259:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=2.6308329668041655, CurrSamplesPerSec=2.6292834012212243, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:59:35,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=0, lr=[9.998228435697648e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:59:35,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=2.6308156751171636, CurrSamplesPerSec=2.639183480310452, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 00:59:50,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=0, lr=[9.998183304842759e-06], mom=[(0.9, 0.95)] +[2024-11-01 00:59:50,960] [INFO] [timer.py:259:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=2.630749228908918, CurrSamplesPerSec=2.6551650794633535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:00:06,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=0, lr=[9.99813760644254e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:00:06,277] [INFO] [timer.py:259:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=2.6307199984768173, CurrSamplesPerSec=2.6320086686509137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:00:21,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=0, lr=[9.998091340502185e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:00:21,648] [INFO] [timer.py:259:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=2.6306563378025234, CurrSamplesPerSec=2.6380775329489694, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:00:36,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=0, lr=[9.998044507026945e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:00:36,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=2.6306491978694897, CurrSamplesPerSec=2.6185272181162373, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:00:52,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=0, lr=[9.997997106022136e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:00:52,202] [INFO] [timer.py:259:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=2.63066292571211, CurrSamplesPerSec=2.6378137365270704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:01:07,491] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=0, lr=[9.997949137493146e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:01:07,515] [INFO] [timer.py:259:stop] epoch=0/micro_step=1850/global_step=1850, RunningAvgSamplesPerSec=2.630643294846223, CurrSamplesPerSec=2.615321294701102, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:01:22,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=0, lr=[9.997900601445417e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:01:22,852] [INFO] [timer.py:259:stop] epoch=0/micro_step=1860/global_step=1860, RunningAvgSamplesPerSec=2.630602942308318, CurrSamplesPerSec=2.6269873787483795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:01:38,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=0, lr=[9.997851497884461e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:01:38,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=1870/global_step=1870, RunningAvgSamplesPerSec=2.6305615085567178, CurrSamplesPerSec=2.584232624755091, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:01:53,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=0, lr=[9.997801826815857e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:01:53,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=1880/global_step=1880, RunningAvgSamplesPerSec=2.6305585262989877, CurrSamplesPerSec=2.6265761077553615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:02:08,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=0, lr=[9.997751588245241e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:02:08,776] [INFO] [timer.py:259:stop] epoch=0/micro_step=1890/global_step=1890, RunningAvgSamplesPerSec=2.6305508623611136, CurrSamplesPerSec=2.6140723211790413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:02:24,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=0, lr=[9.997700782178319e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:02:24,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=1900/global_step=1900, RunningAvgSamplesPerSec=2.6305304225046697, CurrSamplesPerSec=2.6435415795606922, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:02:39,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=0, lr=[9.99764940862086e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:02:39,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=1910/global_step=1910, RunningAvgSamplesPerSec=2.6304667809093116, CurrSamplesPerSec=2.617194340597592, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:02:54,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=0, lr=[9.9975974675787e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:02:54,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=1920/global_step=1920, RunningAvgSamplesPerSec=2.630466684118292, CurrSamplesPerSec=2.6109646489668132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:03:10,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=0, lr=[9.997544959057733e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:03:10,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=1930/global_step=1930, RunningAvgSamplesPerSec=2.630458819435548, CurrSamplesPerSec=2.648030699127795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:03:25,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=0, lr=[9.997491883063924e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:03:25,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=1940/global_step=1940, RunningAvgSamplesPerSec=2.630425479741062, CurrSamplesPerSec=2.633932166436949, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:03:40,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=0, lr=[9.997438239603297e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:03:40,691] [INFO] [timer.py:259:stop] epoch=0/micro_step=1950/global_step=1950, RunningAvgSamplesPerSec=2.63039021496398, CurrSamplesPerSec=2.6350416757501476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:03:55,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=0, lr=[9.997384028681947e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:03:55,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=1960/global_step=1960, RunningAvgSamplesPerSec=2.6303779658480537, CurrSamplesPerSec=2.6187013317441394, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:04:11,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=0, lr=[9.997329250306028e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:04:11,350] [INFO] [timer.py:259:stop] epoch=0/micro_step=1970/global_step=1970, RunningAvgSamplesPerSec=2.6303022667014986, CurrSamplesPerSec=2.627932556087351, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:04:26,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=0, lr=[9.997273904481759e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:04:26,636] [INFO] [timer.py:259:stop] epoch=0/micro_step=1980/global_step=1980, RunningAvgSamplesPerSec=2.630298274113017, CurrSamplesPerSec=2.6196106954376956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:04:41,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=0, lr=[9.997217991215425e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:04:41,938] [INFO] [timer.py:259:stop] epoch=0/micro_step=1990/global_step=1990, RunningAvgSamplesPerSec=2.6302706032873884, CurrSamplesPerSec=2.6421989336863643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:04:57,289] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=0, lr=[9.997161510513377e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:04:57,290] [INFO] [timer.py:259:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=2.630217563732129, CurrSamplesPerSec=2.6462021699355636, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:05:12,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=0, lr=[9.997104462382025e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:05:12,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=2010/global_step=2010, RunningAvgSamplesPerSec=2.6302002181742887, CurrSamplesPerSec=2.636626062124224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:05:27,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=0, lr=[9.99704684682785e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:05:27,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=2020/global_step=2020, RunningAvgSamplesPerSec=2.6301649678512113, CurrSamplesPerSec=2.6459446747885047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:05:43,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=0, lr=[9.996988663857393e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:05:43,258] [INFO] [timer.py:259:stop] epoch=0/micro_step=2030/global_step=2030, RunningAvgSamplesPerSec=2.630147892149875, CurrSamplesPerSec=2.630346520054571, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:05:58,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=0, lr=[9.99692991347726e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:05:58,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=2040/global_step=2040, RunningAvgSamplesPerSec=2.6300894386558493, CurrSamplesPerSec=2.615852215015309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:06:13,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=0, lr=[9.996870595694123e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:06:13,929] [INFO] [timer.py:259:stop] epoch=0/micro_step=2050/global_step=2050, RunningAvgSamplesPerSec=2.630091353535475, CurrSamplesPerSec=2.6326604009720107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:06:29,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=0, lr=[9.996810710514717e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:06:29,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=2060/global_step=2060, RunningAvgSamplesPerSec=2.630061075506092, CurrSamplesPerSec=2.6095282398374398, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:06:44,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=0, lr=[9.996750257945843e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:06:44,556] [INFO] [timer.py:259:stop] epoch=0/micro_step=2070/global_step=2070, RunningAvgSamplesPerSec=2.6300489953827677, CurrSamplesPerSec=2.619373071411673, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:06:59,881] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=0, lr=[9.996689237994364e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:06:59,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=2080/global_step=2080, RunningAvgSamplesPerSec=2.630017104918452, CurrSamplesPerSec=2.6297115958080832, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:07:15,232] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=0, lr=[9.99662765066721e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:07:15,238] [INFO] [timer.py:259:stop] epoch=0/micro_step=2090/global_step=2090, RunningAvgSamplesPerSec=2.629976062679916, CurrSamplesPerSec=2.63763499881412, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:07:30,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=0, lr=[9.996565495971373e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:07:30,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=2100/global_step=2100, RunningAvgSamplesPerSec=2.629883499788635, CurrSamplesPerSec=2.6453109577734044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:07:45,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=0, lr=[9.996502773913913e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:07:45,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=2110/global_step=2110, RunningAvgSamplesPerSec=2.6298974243425084, CurrSamplesPerSec=2.632940521984563, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:08:01,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=0, lr=[9.99643948450195e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:08:01,258] [INFO] [timer.py:259:stop] epoch=0/micro_step=2120/global_step=2120, RunningAvgSamplesPerSec=2.629853265569406, CurrSamplesPerSec=2.6281877923651162, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:08:16,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=0, lr=[9.99637562774267e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:08:16,548] [INFO] [timer.py:259:stop] epoch=0/micro_step=2130/global_step=2130, RunningAvgSamplesPerSec=2.6298488141381133, CurrSamplesPerSec=2.622708655686754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:08:31,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=0, lr=[9.996311203643325e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:08:31,877] [INFO] [timer.py:259:stop] epoch=0/micro_step=2140/global_step=2140, RunningAvgSamplesPerSec=2.6298141487596602, CurrSamplesPerSec=2.6394865845474733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:08:47,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=0, lr=[9.996246212211232e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:08:47,216] [INFO] [timer.py:259:stop] epoch=0/micro_step=2150/global_step=2150, RunningAvgSamplesPerSec=2.629781675816186, CurrSamplesPerSec=2.6166632835047925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:09:02,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=0, lr=[9.996180653453768e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:09:02,484] [INFO] [timer.py:259:stop] epoch=0/micro_step=2160/global_step=2160, RunningAvgSamplesPerSec=2.6297968610048574, CurrSamplesPerSec=2.6416227430644534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:09:17,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=0, lr=[9.99611452737838e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:09:17,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=2170/global_step=2170, RunningAvgSamplesPerSec=2.629758075544618, CurrSamplesPerSec=2.567292120971184, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:09:33,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=0, lr=[9.996047833992574e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:09:33,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=2180/global_step=2180, RunningAvgSamplesPerSec=2.6297153583518322, CurrSamplesPerSec=2.6400194693677266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:09:48,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=0, lr=[9.995980573303924e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:09:48,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=2190/global_step=2190, RunningAvgSamplesPerSec=2.6296612191331215, CurrSamplesPerSec=2.6006687051790216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:10:03,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=0, lr=[9.995912745320067e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:10:03,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=2200/global_step=2200, RunningAvgSamplesPerSec=2.6296295431524155, CurrSamplesPerSec=2.624097228791752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:10:19,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=0, lr=[9.995844350048707e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:10:19,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=2210/global_step=2210, RunningAvgSamplesPerSec=2.629607777392159, CurrSamplesPerSec=2.5780758797829546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:10:34,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=0, lr=[9.995775387497609e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:10:34,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=2220/global_step=2220, RunningAvgSamplesPerSec=2.6295907506396072, CurrSamplesPerSec=2.6364690292684885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:10:49,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=0, lr=[9.995705857674602e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:10:49,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=2230/global_step=2230, RunningAvgSamplesPerSec=2.6295465525956407, CurrSamplesPerSec=2.6384683472528634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:11:05,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=0, lr=[9.995635760587582e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:11:05,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=2240/global_step=2240, RunningAvgSamplesPerSec=2.629521951523305, CurrSamplesPerSec=2.6195738833862436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:11:20,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=0, lr=[9.995565096244512e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:11:20,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=2250/global_step=2250, RunningAvgSamplesPerSec=2.6295152038699237, CurrSamplesPerSec=2.6239855963620182, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:11:35,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=0, lr=[9.995493864653411e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:11:35,806] [INFO] [timer.py:259:stop] epoch=0/micro_step=2260/global_step=2260, RunningAvgSamplesPerSec=2.629482355264985, CurrSamplesPerSec=2.6324579907344825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:11:51,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=0, lr=[9.99542206582237e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:11:51,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=2270/global_step=2270, RunningAvgSamplesPerSec=2.6294581243421695, CurrSamplesPerSec=2.6308893346787405, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:12:06,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=0, lr=[9.995349699759542e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:12:06,384] [INFO] [timer.py:259:stop] epoch=0/micro_step=2280/global_step=2280, RunningAvgSamplesPerSec=2.629477392309275, CurrSamplesPerSec=2.6330115946120265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:12:21,666] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=0, lr=[9.995276766473145e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:12:21,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=2290/global_step=2290, RunningAvgSamplesPerSec=2.6294827378250396, CurrSamplesPerSec=2.5973756416636062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:12:37,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=0, lr=[9.995203265971456e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:12:37,052] [INFO] [timer.py:259:stop] epoch=0/micro_step=2300/global_step=2300, RunningAvgSamplesPerSec=2.629424330866521, CurrSamplesPerSec=2.634779313651999, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:12:52,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=0, lr=[9.995129198262826e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:12:52,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=2310/global_step=2310, RunningAvgSamplesPerSec=2.629422191624685, CurrSamplesPerSec=2.624589427630622, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:13:07,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=0, lr=[9.995054563355665e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:13:07,720] [INFO] [timer.py:259:stop] epoch=0/micro_step=2320/global_step=2320, RunningAvgSamplesPerSec=2.6293582738392733, CurrSamplesPerSec=2.627335826036199, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:13:23,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=0, lr=[9.994979361258442e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:13:23,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=2330/global_step=2330, RunningAvgSamplesPerSec=2.6293478115446303, CurrSamplesPerSec=2.6345298284555274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:13:38,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=0, lr=[9.994903591979704e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:13:38,356] [INFO] [timer.py:259:stop] epoch=0/micro_step=2340/global_step=2340, RunningAvgSamplesPerSec=2.6293122159854376, CurrSamplesPerSec=2.612359115494651, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:13:53,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=0, lr=[9.99482725552805e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:13:53,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=2350/global_step=2350, RunningAvgSamplesPerSec=2.6292947049113162, CurrSamplesPerSec=2.6116145371449178, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:14:08,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=0, lr=[9.99475035191215e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:14:09,008] [INFO] [timer.py:259:stop] epoch=0/micro_step=2360/global_step=2360, RunningAvgSamplesPerSec=2.6292768833579094, CurrSamplesPerSec=2.59276475335059, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:14:24,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=0, lr=[9.994672881140734e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:14:24,317] [INFO] [timer.py:259:stop] epoch=0/micro_step=2370/global_step=2370, RunningAvgSamplesPerSec=2.6292660632126177, CurrSamplesPerSec=2.6508389662368614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:14:39,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=0, lr=[9.994594843222603e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:14:39,601] [INFO] [timer.py:259:stop] epoch=0/micro_step=2380/global_step=2380, RunningAvgSamplesPerSec=2.6292675451900482, CurrSamplesPerSec=2.615534126270706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:14:54,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=0, lr=[9.994516238166613e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:14:54,919] [INFO] [timer.py:259:stop] epoch=0/micro_step=2390/global_step=2390, RunningAvgSamplesPerSec=2.6292579194718844, CurrSamplesPerSec=2.6355607603707996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:15:10,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=0, lr=[9.994437065981693e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:15:10,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=2400/global_step=2400, RunningAvgSamplesPerSec=2.629225177689363, CurrSamplesPerSec=2.6253599108109027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:15:25,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=0, lr=[9.994357326676833e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:15:25,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=2410/global_step=2410, RunningAvgSamplesPerSec=2.6291609941330005, CurrSamplesPerSec=2.5159844029767577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:15:40,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=0, lr=[9.994277020261088e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:15:40,922] [INFO] [timer.py:259:stop] epoch=0/micro_step=2420/global_step=2420, RunningAvgSamplesPerSec=2.629171070465038, CurrSamplesPerSec=2.631440215252502, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:15:56,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=0, lr=[9.994196146743573e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:15:56,279] [INFO] [timer.py:259:stop] epoch=0/micro_step=2430/global_step=2430, RunningAvgSamplesPerSec=2.6291201740020336, CurrSamplesPerSec=2.6419393047807396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:16:11,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=0, lr=[9.994114706133477e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:16:11,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=2440/global_step=2440, RunningAvgSamplesPerSec=2.629088312466965, CurrSamplesPerSec=2.6324645995568, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:16:26,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=0, lr=[9.994032698440041e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:16:26,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=2450/global_step=2450, RunningAvgSamplesPerSec=2.6290989580051205, CurrSamplesPerSec=2.6218303273234094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:16:42,169] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=0, lr=[9.993950123672583e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:16:42,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=2460/global_step=2460, RunningAvgSamplesPerSec=2.629099620157007, CurrSamplesPerSec=2.6495437125167594, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:16:57,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=0, lr=[9.993866981840477e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:16:57,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=2470/global_step=2470, RunningAvgSamplesPerSec=2.6291100058256127, CurrSamplesPerSec=2.6337477525053448, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:17:12,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=0, lr=[9.993783272953161e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:17:12,770] [INFO] [timer.py:259:stop] epoch=0/micro_step=2480/global_step=2480, RunningAvgSamplesPerSec=2.6290999961976396, CurrSamplesPerSec=2.630966485331839, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:17:28,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=0, lr=[9.993698997020144e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:17:28,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=2490/global_step=2490, RunningAvgSamplesPerSec=2.6290745011744088, CurrSamplesPerSec=2.6159737616548786, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:17:43,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=0, lr=[9.993614154050997e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:17:43,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=2500/global_step=2500, RunningAvgSamplesPerSec=2.6289997018640103, CurrSamplesPerSec=2.6309359545858677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:17:58,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=0, lr=[9.993528744055349e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:17:58,858] [INFO] [timer.py:259:stop] epoch=0/micro_step=2510/global_step=2510, RunningAvgSamplesPerSec=2.6289733635302435, CurrSamplesPerSec=2.6031737759783264, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:18:14,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=0, lr=[9.9934427670429e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:18:14,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=2520/global_step=2520, RunningAvgSamplesPerSec=2.628957556106585, CurrSamplesPerSec=2.6470664177881473, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:18:29,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=0, lr=[9.993356223023414e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:18:29,451] [INFO] [timer.py:259:stop] epoch=0/micro_step=2530/global_step=2530, RunningAvgSamplesPerSec=2.62895672172923, CurrSamplesPerSec=2.6089816258743888, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:18:44,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=0, lr=[9.993269112006719e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:18:44,722] [INFO] [timer.py:259:stop] epoch=0/micro_step=2540/global_step=2540, RunningAvgSamplesPerSec=2.6289689373560607, CurrSamplesPerSec=2.635644810049486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:19:00,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=0, lr=[9.993181434002702e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:19:00,016] [INFO] [timer.py:259:stop] epoch=0/micro_step=2550/global_step=2550, RunningAvgSamplesPerSec=2.6289590619985987, CurrSamplesPerSec=2.6260741204468157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:19:15,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=0, lr=[9.993093189021323e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:19:15,339] [INFO] [timer.py:259:stop] epoch=0/micro_step=2560/global_step=2560, RunningAvgSamplesPerSec=2.6289425809479443, CurrSamplesPerSec=2.62399831869614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:19:30,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=0, lr=[9.993004377072602e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:19:30,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=2570/global_step=2570, RunningAvgSamplesPerSec=2.628937038151284, CurrSamplesPerSec=2.6515386114655946, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:19:46,035] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=0, lr=[9.992914998166622e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:19:46,037] [INFO] [timer.py:259:stop] epoch=0/micro_step=2580/global_step=2580, RunningAvgSamplesPerSec=2.628858968169414, CurrSamplesPerSec=2.626302272707543, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:20:01,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=0, lr=[9.992825052313533e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:20:01,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=2590/global_step=2590, RunningAvgSamplesPerSec=2.6288565221097353, CurrSamplesPerSec=2.6321271789750007, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:20:16,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=0, lr=[9.992734539523548e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:20:16,657] [INFO] [timer.py:259:stop] epoch=0/micro_step=2600/global_step=2600, RunningAvgSamplesPerSec=2.6288335010983466, CurrSamplesPerSec=2.5517003569618857, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:20:31,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=0, lr=[9.992643459806944e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:20:31,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=2610/global_step=2610, RunningAvgSamplesPerSec=2.628837901580556, CurrSamplesPerSec=2.6331198637792355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:20:47,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=0, lr=[9.992551813174065e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:20:47,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=2620/global_step=2620, RunningAvgSamplesPerSec=2.628815433272638, CurrSamplesPerSec=2.559042079551201, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:21:02,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=0, lr=[9.992459599635315e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:21:02,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=2630/global_step=2630, RunningAvgSamplesPerSec=2.628812884382587, CurrSamplesPerSec=2.637797562048117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:21:18,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=0, lr=[9.992366819201167e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:21:18,010] [INFO] [timer.py:259:stop] epoch=0/micro_step=2640/global_step=2640, RunningAvgSamplesPerSec=2.6287292047251953, CurrSamplesPerSec=2.5240164382524335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:21:33,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=0, lr=[9.992273471882157e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:21:33,298] [INFO] [timer.py:259:stop] epoch=0/micro_step=2650/global_step=2650, RunningAvgSamplesPerSec=2.628737554420896, CurrSamplesPerSec=2.647345017922293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:21:48,613] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=0, lr=[9.992179557688881e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:21:48,637] [INFO] [timer.py:259:stop] epoch=0/micro_step=2660/global_step=2660, RunningAvgSamplesPerSec=2.6287036050841186, CurrSamplesPerSec=2.568943560351099, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:22:03,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=0, lr=[9.992085076632007e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:22:03,924] [INFO] [timer.py:259:stop] epoch=0/micro_step=2670/global_step=2670, RunningAvgSamplesPerSec=2.6287074791197917, CurrSamplesPerSec=2.6345050066423377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:22:19,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=0, lr=[9.99199002872226e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:22:19,212] [INFO] [timer.py:259:stop] epoch=0/micro_step=2680/global_step=2680, RunningAvgSamplesPerSec=2.6287083748220317, CurrSamplesPerSec=2.6358435695316063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:22:34,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=0, lr=[9.991894413970436e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:22:34,498] [INFO] [timer.py:259:stop] epoch=0/micro_step=2690/global_step=2690, RunningAvgSamplesPerSec=2.6287109723451514, CurrSamplesPerSec=2.6216877518258497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:22:49,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=0, lr=[9.991798232387388e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:22:49,786] [INFO] [timer.py:259:stop] epoch=0/micro_step=2700/global_step=2700, RunningAvgSamplesPerSec=2.628708469346581, CurrSamplesPerSec=2.6159802879684646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:23:05,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=0, lr=[9.991701483984042e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:23:05,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=2710/global_step=2710, RunningAvgSamplesPerSec=2.628708208320105, CurrSamplesPerSec=2.6138686866253438, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:23:20,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=0, lr=[9.991604168771381e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:23:20,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=2720/global_step=2720, RunningAvgSamplesPerSec=2.628688636266455, CurrSamplesPerSec=2.5943729097416575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:23:35,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=0, lr=[9.991506286760455e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:23:35,755] [INFO] [timer.py:259:stop] epoch=0/micro_step=2730/global_step=2730, RunningAvgSamplesPerSec=2.6286883490055475, CurrSamplesPerSec=2.628302253118746, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:23:51,035] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=0, lr=[9.99140783796238e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:23:51,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=2740/global_step=2740, RunningAvgSamplesPerSec=2.6286937821739134, CurrSamplesPerSec=2.63852726985583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:24:06,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=0, lr=[9.991308822388333e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:24:06,341] [INFO] [timer.py:259:stop] epoch=0/micro_step=2750/global_step=2750, RunningAvgSamplesPerSec=2.628682735831544, CurrSamplesPerSec=2.6345207270697157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:24:21,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=0, lr=[9.99120924004956e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:24:21,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=2760/global_step=2760, RunningAvgSamplesPerSec=2.628664198624681, CurrSamplesPerSec=2.6381132075840026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:24:36,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=0, lr=[9.991109090957367e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:24:36,942] [INFO] [timer.py:259:stop] epoch=0/micro_step=2770/global_step=2770, RunningAvgSamplesPerSec=2.6286798780927034, CurrSamplesPerSec=2.637223289253947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:24:52,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=0, lr=[9.991008375123127e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:24:52,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=2780/global_step=2780, RunningAvgSamplesPerSec=2.6286832896992443, CurrSamplesPerSec=2.6181320742709238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:25:07,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=0, lr=[9.990907092558274e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:25:07,491] [INFO] [timer.py:259:stop] epoch=0/micro_step=2790/global_step=2790, RunningAvgSamplesPerSec=2.6287019740929685, CurrSamplesPerSec=2.6425289523935165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:25:22,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=0, lr=[9.990805243274308e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:25:22,723] [INFO] [timer.py:259:stop] epoch=0/micro_step=2800/global_step=2800, RunningAvgSamplesPerSec=2.6287417955400896, CurrSamplesPerSec=2.6537598113847247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:25:37,987] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=0, lr=[9.990702827282798e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:25:38,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=2810/global_step=2810, RunningAvgSamplesPerSec=2.628744203144131, CurrSamplesPerSec=2.62170086156904, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:25:53,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=0, lr=[9.99059984459537e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:25:53,299] [INFO] [timer.py:259:stop] epoch=0/micro_step=2820/global_step=2820, RunningAvgSamplesPerSec=2.6287443639550925, CurrSamplesPerSec=2.6452613245617083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:26:08,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=0, lr=[9.990496295223721e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:26:08,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=2830/global_step=2830, RunningAvgSamplesPerSec=2.6287422661234574, CurrSamplesPerSec=2.553869717292811, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:26:23,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=0, lr=[9.990392179179606e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:26:23,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=2840/global_step=2840, RunningAvgSamplesPerSec=2.6287497938972315, CurrSamplesPerSec=2.632047482703754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:26:39,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=0, lr=[9.990287496474851e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:26:39,210] [INFO] [timer.py:259:stop] epoch=0/micro_step=2850/global_step=2850, RunningAvgSamplesPerSec=2.62871783460528, CurrSamplesPerSec=2.6149373069791637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:26:54,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=0, lr=[9.990182247121338e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:26:54,486] [INFO] [timer.py:259:stop] epoch=0/micro_step=2860/global_step=2860, RunningAvgSamplesPerSec=2.6287277694818627, CurrSamplesPerSec=2.627369564919889, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:27:09,732] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=0, lr=[9.990076431131022e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:27:09,749] [INFO] [timer.py:259:stop] epoch=0/micro_step=2870/global_step=2870, RunningAvgSamplesPerSec=2.6287425576594314, CurrSamplesPerSec=2.6260083541830546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:27:24,999] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=0, lr=[9.989970048515915e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:27:25,012] [INFO] [timer.py:259:stop] epoch=0/micro_step=2880/global_step=2880, RunningAvgSamplesPerSec=2.6287607463064666, CurrSamplesPerSec=2.646684743106485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:27:40,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=0, lr=[9.9898630992881e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:27:40,374] [INFO] [timer.py:259:stop] epoch=0/micro_step=2890/global_step=2890, RunningAvgSamplesPerSec=2.6287140880189392, CurrSamplesPerSec=2.6346969741761304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:27:55,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=0, lr=[9.989755583459719e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:27:55,610] [INFO] [timer.py:259:stop] epoch=0/micro_step=2900/global_step=2900, RunningAvgSamplesPerSec=2.628741356187722, CurrSamplesPerSec=2.6292265388982883, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:28:10,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=0, lr=[9.98964750104298e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:28:10,907] [INFO] [timer.py:259:stop] epoch=0/micro_step=2910/global_step=2910, RunningAvgSamplesPerSec=2.6287393173509224, CurrSamplesPerSec=2.641509198676538, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:28:26,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=0, lr=[9.98953885205016e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:28:26,211] [INFO] [timer.py:259:stop] epoch=0/micro_step=2920/global_step=2920, RunningAvgSamplesPerSec=2.6287425313843515, CurrSamplesPerSec=2.6463144485005334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:28:41,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=0, lr=[9.989429636493591e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:28:41,481] [INFO] [timer.py:259:stop] epoch=0/micro_step=2930/global_step=2930, RunningAvgSamplesPerSec=2.6287493564595583, CurrSamplesPerSec=2.6138475104530707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:28:56,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=0, lr=[9.989319854385677e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:28:56,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=2940/global_step=2940, RunningAvgSamplesPerSec=2.6287447898319556, CurrSamplesPerSec=2.6342158669800595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:29:12,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=0, lr=[9.989209505738884e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:29:12,113] [INFO] [timer.py:259:stop] epoch=0/micro_step=2950/global_step=2950, RunningAvgSamplesPerSec=2.6287282723008607, CurrSamplesPerSec=2.6117324377034143, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:29:27,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=0, lr=[9.989098590565742e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:29:27,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=2960/global_step=2960, RunningAvgSamplesPerSec=2.628697870756917, CurrSamplesPerSec=2.6248633162624064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:29:42,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=0, lr=[9.988987108878843e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:29:42,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=2970/global_step=2970, RunningAvgSamplesPerSec=2.6286810692278597, CurrSamplesPerSec=2.6115897387272704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:29:58,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=0, lr=[9.988875060690851e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:29:58,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=2980/global_step=2980, RunningAvgSamplesPerSec=2.6286681356298094, CurrSamplesPerSec=2.6013033940326844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:30:13,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=0, lr=[9.988762446014483e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:30:13,362] [INFO] [timer.py:259:stop] epoch=0/micro_step=2990/global_step=2990, RunningAvgSamplesPerSec=2.6286793064094605, CurrSamplesPerSec=2.613857691262299, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:30:28,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=0, lr=[9.98864926486253e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:30:28,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=3000/global_step=3000, RunningAvgSamplesPerSec=2.6286989741409283, CurrSamplesPerSec=2.6041807132599435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:30:43,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=0, lr=[9.988535517247844e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:30:43,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=3010/global_step=3010, RunningAvgSamplesPerSec=2.628694246435546, CurrSamplesPerSec=2.6363666986618934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:30:59,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=0, lr=[9.988421203183338e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:30:59,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=3020/global_step=3020, RunningAvgSamplesPerSec=2.6286890721566993, CurrSamplesPerSec=2.623634754796008, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:31:14,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=0, lr=[9.988306322681997e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:31:14,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=3030/global_step=3030, RunningAvgSamplesPerSec=2.6287030331505887, CurrSamplesPerSec=2.637852307238906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:31:29,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=0, lr=[9.988190875756862e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:31:29,795] [INFO] [timer.py:259:stop] epoch=0/micro_step=3040/global_step=3040, RunningAvgSamplesPerSec=2.6287133774340328, CurrSamplesPerSec=2.6369687818823095, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:31:45,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=0, lr=[9.988074862421045e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:31:45,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=3050/global_step=3050, RunningAvgSamplesPerSec=2.6287417593043894, CurrSamplesPerSec=2.6326228081457983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:32:00,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=0, lr=[9.987958282687715e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:32:00,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=3060/global_step=3060, RunningAvgSamplesPerSec=2.6287446753069657, CurrSamplesPerSec=2.6265658276304467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:32:15,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=0, lr=[9.987841136570114e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:32:15,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=3070/global_step=3070, RunningAvgSamplesPerSec=2.628755615803973, CurrSamplesPerSec=2.628245021495732, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:32:30,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=0, lr=[9.987723424081542e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:32:30,929] [INFO] [timer.py:259:stop] epoch=0/micro_step=3080/global_step=3080, RunningAvgSamplesPerSec=2.6287333512389317, CurrSamplesPerSec=2.653960892714852, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:32:46,201] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=0, lr=[9.987605145235364e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:32:46,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=3090/global_step=3090, RunningAvgSamplesPerSec=2.628739523044994, CurrSamplesPerSec=2.634771451871771, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:33:01,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=0, lr=[9.987486300045013e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:33:01,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=3100/global_step=3100, RunningAvgSamplesPerSec=2.6287373859612497, CurrSamplesPerSec=2.6527431172282365, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:33:16,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=0, lr=[9.987366888523984e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:33:16,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=3110/global_step=3110, RunningAvgSamplesPerSec=2.628733067516186, CurrSamplesPerSec=2.6406697722362726, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:33:32,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=0, lr=[9.987246910685836e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:33:32,064] [INFO] [timer.py:259:stop] epoch=0/micro_step=3120/global_step=3120, RunningAvgSamplesPerSec=2.6287458309848, CurrSamplesPerSec=2.6544765389856053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:33:47,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=0, lr=[9.987126366544191e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:33:47,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=3130/global_step=3130, RunningAvgSamplesPerSec=2.628753295614667, CurrSamplesPerSec=2.6363973555810305, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:34:02,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=0, lr=[9.987005256112737e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:34:02,639] [INFO] [timer.py:259:stop] epoch=0/micro_step=3140/global_step=3140, RunningAvgSamplesPerSec=2.6287467369771425, CurrSamplesPerSec=2.621797959686084, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:34:17,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=0, lr=[9.986883579405226e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:34:17,911] [INFO] [timer.py:259:stop] epoch=0/micro_step=3150/global_step=3150, RunningAvgSamplesPerSec=2.6287569641417314, CurrSamplesPerSec=2.5953999490894497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:34:33,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=0, lr=[9.986761336435478e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:34:33,201] [INFO] [timer.py:259:stop] epoch=0/micro_step=3160/global_step=3160, RunningAvgSamplesPerSec=2.628768859227436, CurrSamplesPerSec=2.651159416570556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:34:48,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=0, lr=[9.98663852721737e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:34:48,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=3170/global_step=3170, RunningAvgSamplesPerSec=2.628780255595788, CurrSamplesPerSec=2.621880314358766, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:35:03,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=0, lr=[9.986515151764846e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:35:03,725] [INFO] [timer.py:259:stop] epoch=0/micro_step=3180/global_step=3180, RunningAvgSamplesPerSec=2.628793255681611, CurrSamplesPerSec=2.606249153348128, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:35:19,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=0, lr=[9.98639121009192e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:35:19,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=3190/global_step=3190, RunningAvgSamplesPerSec=2.6287974897317787, CurrSamplesPerSec=2.599710806979382, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:35:34,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=0, lr=[9.986266702212661e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:35:34,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=3200/global_step=3200, RunningAvgSamplesPerSec=2.6288038459665484, CurrSamplesPerSec=2.633488954981615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:35:49,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=0, lr=[9.98614162814121e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:35:49,537] [INFO] [timer.py:259:stop] epoch=0/micro_step=3210/global_step=3210, RunningAvgSamplesPerSec=2.628826863409781, CurrSamplesPerSec=2.634064497278163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:36:04,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=0, lr=[9.986015987891767e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:36:04,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=3220/global_step=3220, RunningAvgSamplesPerSec=2.6288368804544264, CurrSamplesPerSec=2.640673097291217, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:36:20,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=0, lr=[9.985889781478601e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:36:20,152] [INFO] [timer.py:259:stop] epoch=0/micro_step=3230/global_step=3230, RunningAvgSamplesPerSec=2.6288167922622425, CurrSamplesPerSec=2.6337986084771208, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:36:35,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=0, lr=[9.985763008916039e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:36:35,405] [INFO] [timer.py:259:stop] epoch=0/micro_step=3240/global_step=3240, RunningAvgSamplesPerSec=2.628835953767806, CurrSamplesPerSec=2.6324166863467884, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:36:50,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=0, lr=[9.985635670218481e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:36:50,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=3250/global_step=3250, RunningAvgSamplesPerSec=2.628817312451261, CurrSamplesPerSec=2.6323026929620124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:37:06,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=0, lr=[9.985507765400381e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:37:06,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=3260/global_step=3260, RunningAvgSamplesPerSec=2.628770778884752, CurrSamplesPerSec=2.630310642786567, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:37:21,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=0, lr=[9.985379294476268e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:37:21,437] [INFO] [timer.py:259:stop] epoch=0/micro_step=3270/global_step=3270, RunningAvgSamplesPerSec=2.628755729713438, CurrSamplesPerSec=2.6350983760026536, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:37:36,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=0, lr=[9.985250257460725e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:37:36,724] [INFO] [timer.py:259:stop] epoch=0/micro_step=3280/global_step=3280, RunningAvgSamplesPerSec=2.6287551060207868, CurrSamplesPerSec=2.6440644354572895, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:37:52,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=0, lr=[9.985120654368407e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:37:52,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=3290/global_step=3290, RunningAvgSamplesPerSec=2.6287609974414305, CurrSamplesPerSec=2.6434070453251652, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:38:07,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=0, lr=[9.984990485214031e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:38:07,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=3300/global_step=3300, RunningAvgSamplesPerSec=2.6287632750185463, CurrSamplesPerSec=2.626276372348202, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:38:22,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=0, lr=[9.984859750012377e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:38:22,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=3310/global_step=3310, RunningAvgSamplesPerSec=2.628767679102957, CurrSamplesPerSec=2.6039248644619937, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:38:37,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=0, lr=[9.984728448778289e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:38:37,785] [INFO] [timer.py:259:stop] epoch=0/micro_step=3320/global_step=3320, RunningAvgSamplesPerSec=2.6288056130761284, CurrSamplesPerSec=2.6394977965872806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:38:53,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=0, lr=[9.984596581526675e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:38:53,047] [INFO] [timer.py:259:stop] epoch=0/micro_step=3330/global_step=3330, RunningAvgSamplesPerSec=2.6288174154583044, CurrSamplesPerSec=2.6386139987560826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:39:08,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=0, lr=[9.984464148272513e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:39:08,322] [INFO] [timer.py:259:stop] epoch=0/micro_step=3340/global_step=3340, RunningAvgSamplesPerSec=2.6288254268203137, CurrSamplesPerSec=2.606671092659757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:39:23,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=0, lr=[9.984331149030838e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:39:23,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=3350/global_step=3350, RunningAvgSamplesPerSec=2.6288332234176672, CurrSamplesPerSec=2.6386447079631377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:39:38,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=0, lr=[9.984197583816753e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:39:38,879] [INFO] [timer.py:259:stop] epoch=0/micro_step=3360/global_step=3360, RunningAvgSamplesPerSec=2.6288284293807145, CurrSamplesPerSec=2.648814590621503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:39:54,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=0, lr=[9.984063452645422e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:39:54,118] [INFO] [timer.py:259:stop] epoch=0/micro_step=3370/global_step=3370, RunningAvgSamplesPerSec=2.6288539585674253, CurrSamplesPerSec=2.6440165157515323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:40:09,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=0, lr=[9.98392875553208e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:40:09,351] [INFO] [timer.py:259:stop] epoch=0/micro_step=3380/global_step=3380, RunningAvgSamplesPerSec=2.628875564386278, CurrSamplesPerSec=2.6360510567173008, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:40:24,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=0, lr=[9.983793492492018e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:40:24,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=3390/global_step=3390, RunningAvgSamplesPerSec=2.6288801790111775, CurrSamplesPerSec=2.6391357373597955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:40:39,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=0, lr=[9.983657663540596e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:40:39,947] [INFO] [timer.py:259:stop] epoch=0/micro_step=3400/global_step=3400, RunningAvgSamplesPerSec=2.628867312930915, CurrSamplesPerSec=2.6219696402008736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:40:55,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=0, lr=[9.98352126869324e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:40:55,232] [INFO] [timer.py:259:stop] epoch=0/micro_step=3410/global_step=3410, RunningAvgSamplesPerSec=2.6288709146665483, CurrSamplesPerSec=2.6339764130823045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:41:10,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=0, lr=[9.983384307965433e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:41:10,507] [INFO] [timer.py:259:stop] epoch=0/micro_step=3420/global_step=3420, RunningAvgSamplesPerSec=2.628877751947062, CurrSamplesPerSec=2.636172002440708, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:41:25,799] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=0, lr=[9.98324678137273e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:41:25,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=3430/global_step=3430, RunningAvgSamplesPerSec=2.628877105818184, CurrSamplesPerSec=2.6354452525636547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:41:41,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=0, lr=[9.983108688930749e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:41:41,091] [INFO] [timer.py:259:stop] epoch=0/micro_step=3440/global_step=3440, RunningAvgSamplesPerSec=2.6288762234570693, CurrSamplesPerSec=2.656108358695492, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:41:56,365] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=0, lr=[9.982970030655166e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:41:56,366] [INFO] [timer.py:259:stop] epoch=0/micro_step=3450/global_step=3450, RunningAvgSamplesPerSec=2.628883753467649, CurrSamplesPerSec=2.63477558964499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:42:11,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=0, lr=[9.982830806561729e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:42:11,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=3460/global_step=3460, RunningAvgSamplesPerSec=2.628884595828827, CurrSamplesPerSec=2.6387961895107503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:42:26,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=0, lr=[9.982691016666248e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:42:26,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=3470/global_step=3470, RunningAvgSamplesPerSec=2.6288899645704658, CurrSamplesPerSec=2.624533589287533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:42:42,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=0, lr=[9.982550660984591e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:42:42,198] [INFO] [timer.py:259:stop] epoch=0/micro_step=3480/global_step=3480, RunningAvgSamplesPerSec=2.628909688574926, CurrSamplesPerSec=2.6565806703446544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:42:57,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=0, lr=[9.982409739532698e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:42:57,505] [INFO] [timer.py:259:stop] epoch=0/micro_step=3490/global_step=3490, RunningAvgSamplesPerSec=2.628905589367885, CurrSamplesPerSec=2.6312128201080647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:43:12,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=0, lr=[9.982268252326575e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:43:12,788] [INFO] [timer.py:259:stop] epoch=0/micro_step=3500/global_step=3500, RunningAvgSamplesPerSec=2.6289082280044074, CurrSamplesPerSec=2.6433424904147933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:43:28,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=0, lr=[9.98212619938228e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:43:28,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=3510/global_step=3510, RunningAvgSamplesPerSec=2.6289016583019635, CurrSamplesPerSec=2.5842624792408255, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:43:43,384] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=0, lr=[9.98198358071595e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:43:43,403] [INFO] [timer.py:259:stop] epoch=0/micro_step=3520/global_step=3520, RunningAvgSamplesPerSec=2.6289115538750703, CurrSamplesPerSec=2.6211765740871513, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:43:58,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=0, lr=[9.981840396343776e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:43:58,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=3530/global_step=3530, RunningAvgSamplesPerSec=2.6288918674453154, CurrSamplesPerSec=2.5886534125077842, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:44:14,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=0, lr=[9.981696646282017e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:44:14,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=3540/global_step=3540, RunningAvgSamplesPerSec=2.6288931004875433, CurrSamplesPerSec=2.6117751284616078, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:44:29,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=0, lr=[9.981552330546996e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:44:29,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=3550/global_step=3550, RunningAvgSamplesPerSec=2.6288492326719854, CurrSamplesPerSec=2.630092513568075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:44:44,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=0, lr=[9.9814074491551e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:44:44,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=3560/global_step=3560, RunningAvgSamplesPerSec=2.62884922310398, CurrSamplesPerSec=2.6255398648838675, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:45:00,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=0, lr=[9.981262002122781e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:45:00,041] [INFO] [timer.py:259:stop] epoch=0/micro_step=3570/global_step=3570, RunningAvgSamplesPerSec=2.6288552398763136, CurrSamplesPerSec=2.6412858816034586, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:45:15,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=0, lr=[9.981115989466555e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:45:15,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=3580/global_step=3580, RunningAvgSamplesPerSec=2.6288598343894884, CurrSamplesPerSec=2.6296217416869045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:45:30,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=0, lr=[9.980969411202998e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:45:30,665] [INFO] [timer.py:259:stop] epoch=0/micro_step=3590/global_step=3590, RunningAvgSamplesPerSec=2.628840399689325, CurrSamplesPerSec=2.631329195290082, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:45:45,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=0, lr=[9.98082226734876e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:45:45,981] [INFO] [timer.py:259:stop] epoch=0/micro_step=3600/global_step=3600, RunningAvgSamplesPerSec=2.6288306701061406, CurrSamplesPerSec=2.6379286225085545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:46:01,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=0, lr=[9.980674557920546e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:46:01,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=3610/global_step=3610, RunningAvgSamplesPerSec=2.6287960441956133, CurrSamplesPerSec=2.606959077849978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:46:16,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=0, lr=[9.980526282935128e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:46:16,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=3620/global_step=3620, RunningAvgSamplesPerSec=2.628807658063568, CurrSamplesPerSec=2.627383554467257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:46:31,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=0, lr=[9.980377442409343e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:46:31,874] [INFO] [timer.py:259:stop] epoch=0/micro_step=3630/global_step=3630, RunningAvgSamplesPerSec=2.6288173977902165, CurrSamplesPerSec=2.6428353234791606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:46:47,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=0, lr=[9.980228036360092e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:46:47,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=3640/global_step=3640, RunningAvgSamplesPerSec=2.628828964595605, CurrSamplesPerSec=2.641888133949042, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:47:02,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=0, lr=[9.980078064804339e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:47:02,451] [INFO] [timer.py:259:stop] epoch=0/micro_step=3650/global_step=3650, RunningAvgSamplesPerSec=2.6288253449437486, CurrSamplesPerSec=2.6413507519456014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:47:17,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=0, lr=[9.979927527759117e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:47:17,779] [INFO] [timer.py:259:stop] epoch=0/micro_step=3660/global_step=3660, RunningAvgSamplesPerSec=2.6288025815055387, CurrSamplesPerSec=2.579184417980568, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:47:33,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=0, lr=[9.979776425241517e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:47:33,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=3670/global_step=3670, RunningAvgSamplesPerSec=2.628797930626298, CurrSamplesPerSec=2.6515746510993994, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:47:48,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=0, lr=[9.979624757268696e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:47:48,386] [INFO] [timer.py:259:stop] epoch=0/micro_step=3680/global_step=3680, RunningAvgSamplesPerSec=2.628786693261991, CurrSamplesPerSec=2.593975396123142, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:48:03,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=0, lr=[9.979472523857875e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:48:03,608] [INFO] [timer.py:259:stop] epoch=0/micro_step=3690/global_step=3690, RunningAvgSamplesPerSec=2.628813251932562, CurrSamplesPerSec=2.6429190051067004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:48:18,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=0, lr=[9.979319725026345e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:48:18,894] [INFO] [timer.py:259:stop] epoch=0/micro_step=3700/global_step=3700, RunningAvgSamplesPerSec=2.6288216403516222, CurrSamplesPerSec=2.6166359405118995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:48:34,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=0, lr=[9.979166360791452e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:48:34,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=3710/global_step=3710, RunningAvgSamplesPerSec=2.628826121848076, CurrSamplesPerSec=2.634473980033586, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:48:49,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=0, lr=[9.979012431170612e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:48:49,430] [INFO] [timer.py:259:stop] epoch=0/micro_step=3720/global_step=3720, RunningAvgSamplesPerSec=2.628836573384336, CurrSamplesPerSec=2.612199265790895, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:49:04,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=0, lr=[9.978857936181304e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:49:04,671] [INFO] [timer.py:259:stop] epoch=0/micro_step=3730/global_step=3730, RunningAvgSamplesPerSec=2.6288572391876355, CurrSamplesPerSec=2.6433387421620127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:49:19,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=0, lr=[9.97870287584107e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:49:19,994] [INFO] [timer.py:259:stop] epoch=0/micro_step=3740/global_step=3740, RunningAvgSamplesPerSec=2.628841814492737, CurrSamplesPerSec=2.626655884259893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:49:35,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=0, lr=[9.978547250167518e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:49:35,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=3750/global_step=3750, RunningAvgSamplesPerSec=2.628838573397043, CurrSamplesPerSec=2.640587479794625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:49:50,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=0, lr=[9.978391059178319e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:49:50,597] [INFO] [timer.py:259:stop] epoch=0/micro_step=3760/global_step=3760, RunningAvgSamplesPerSec=2.628836221396397, CurrSamplesPerSec=2.613593015994701, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:50:05,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=0, lr=[9.97823430289121e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:50:05,864] [INFO] [timer.py:259:stop] epoch=0/micro_step=3770/global_step=3770, RunningAvgSamplesPerSec=2.6288453917998913, CurrSamplesPerSec=2.6127224090647525, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:50:21,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=0, lr=[9.978076981323988e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:50:21,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=3780/global_step=3780, RunningAvgSamplesPerSec=2.628834003454509, CurrSamplesPerSec=2.6236991713027824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:50:36,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=0, lr=[9.977919094494518e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:50:36,515] [INFO] [timer.py:259:stop] epoch=0/micro_step=3790/global_step=3790, RunningAvgSamplesPerSec=2.628812854986199, CurrSamplesPerSec=2.640947443169019, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:50:51,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=0, lr=[9.977760642420728e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:50:51,894] [INFO] [timer.py:259:stop] epoch=0/micro_step=3800/global_step=3800, RunningAvgSamplesPerSec=2.6287848459702086, CurrSamplesPerSec=2.6401187600735145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:51:07,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=0, lr=[9.977601625120612e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:51:07,132] [INFO] [timer.py:259:stop] epoch=0/micro_step=3810/global_step=3810, RunningAvgSamplesPerSec=2.6288050937797203, CurrSamplesPerSec=2.6233861449502816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:51:22,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=0, lr=[9.977442042612225e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:51:22,404] [INFO] [timer.py:259:stop] epoch=0/micro_step=3820/global_step=3820, RunningAvgSamplesPerSec=2.628816256352617, CurrSamplesPerSec=2.6424835855404862, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:51:37,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=0, lr=[9.977281894913687e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:51:37,688] [INFO] [timer.py:259:stop] epoch=0/micro_step=3830/global_step=3830, RunningAvgSamplesPerSec=2.628819345444782, CurrSamplesPerSec=2.6151044215341974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:51:53,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=0, lr=[9.977121182043185e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:51:53,050] [INFO] [timer.py:259:stop] epoch=0/micro_step=3840/global_step=3840, RunningAvgSamplesPerSec=2.628793620998019, CurrSamplesPerSec=2.634643187347828, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:52:08,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=0, lr=[9.976959904018967e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:52:08,316] [INFO] [timer.py:259:stop] epoch=0/micro_step=3850/global_step=3850, RunningAvgSamplesPerSec=2.628807926128132, CurrSamplesPerSec=2.6106603410259845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:52:23,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=0, lr=[9.976798060859343e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:52:23,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=3860/global_step=3860, RunningAvgSamplesPerSec=2.6288142005150132, CurrSamplesPerSec=2.631661457683241, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:52:38,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=0, lr=[9.976635652582693e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:52:38,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=3870/global_step=3870, RunningAvgSamplesPerSec=2.6288053783755396, CurrSamplesPerSec=2.5949828546153046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:52:54,193] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=0, lr=[9.97647267920746e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:52:54,195] [INFO] [timer.py:259:stop] epoch=0/micro_step=3880/global_step=3880, RunningAvgSamplesPerSec=2.6288005149758167, CurrSamplesPerSec=2.633592302525102, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:53:09,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=0, lr=[9.97630914075215e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:53:09,496] [INFO] [timer.py:259:stop] epoch=0/micro_step=3890/global_step=3890, RunningAvgSamplesPerSec=2.6288040014416465, CurrSamplesPerSec=2.629055966685353, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:53:24,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=0, lr=[9.97614503723533e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:53:24,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=3900/global_step=3900, RunningAvgSamplesPerSec=2.6288217586218443, CurrSamplesPerSec=2.635482926040462, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:53:40,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=0, lr=[9.975980368675633e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:53:40,056] [INFO] [timer.py:259:stop] epoch=0/micro_step=3910/global_step=3910, RunningAvgSamplesPerSec=2.628808051547453, CurrSamplesPerSec=2.6433320786277608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:53:55,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=0, lr=[9.975815135091761e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:53:55,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=3920/global_step=3920, RunningAvgSamplesPerSec=2.6288264070401346, CurrSamplesPerSec=2.64053511454765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:54:10,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=0, lr=[9.975649336502475e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:54:10,628] [INFO] [timer.py:259:stop] epoch=0/micro_step=3930/global_step=3930, RunningAvgSamplesPerSec=2.6288152031043612, CurrSamplesPerSec=2.6280325863338043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:54:25,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=0, lr=[9.9754829729266e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:54:25,901] [INFO] [timer.py:259:stop] epoch=0/micro_step=3940/global_step=3940, RunningAvgSamplesPerSec=2.6288261219006577, CurrSamplesPerSec=2.641518348412129, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:54:41,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=0, lr=[9.97531604438303e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:54:41,209] [INFO] [timer.py:259:stop] epoch=0/micro_step=3950/global_step=3950, RunningAvgSamplesPerSec=2.6288152485914633, CurrSamplesPerSec=2.629468426814398, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:54:56,456] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=0, lr=[9.975148550890715e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:54:56,458] [INFO] [timer.py:259:stop] epoch=0/micro_step=3960/global_step=3960, RunningAvgSamplesPerSec=2.6288345938804722, CurrSamplesPerSec=2.640723805416681, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:55:11,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=0, lr=[9.974980492468678e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:55:11,804] [INFO] [timer.py:259:stop] epoch=0/micro_step=3970/global_step=3970, RunningAvgSamplesPerSec=2.6288075387062255, CurrSamplesPerSec=2.63099784197069, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:55:27,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=0, lr=[9.974811869136e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:55:27,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=3980/global_step=3980, RunningAvgSamplesPerSec=2.628828766052313, CurrSamplesPerSec=2.634983736407026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:55:42,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=0, lr=[9.974642680911828e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:55:42,377] [INFO] [timer.py:259:stop] epoch=0/micro_step=3990/global_step=3990, RunningAvgSamplesPerSec=2.6288116165697417, CurrSamplesPerSec=2.6320784521840195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:55:57,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=0, lr=[9.974472927815376e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:55:57,677] [INFO] [timer.py:259:stop] epoch=0/micro_step=4000/global_step=4000, RunningAvgSamplesPerSec=2.628809710297524, CurrSamplesPerSec=2.630341159021293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:56:12,957] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=0, lr=[9.974302609865918e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:56:12,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=4010/global_step=4010, RunningAvgSamplesPerSec=2.6288137625488694, CurrSamplesPerSec=2.622959597583797, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:56:28,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=0, lr=[9.974131727082792e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:56:28,294] [INFO] [timer.py:259:stop] epoch=0/micro_step=4020/global_step=4020, RunningAvgSamplesPerSec=2.6288018045174817, CurrSamplesPerSec=2.5925776455598903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:56:43,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=0, lr=[9.973960279485403e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:56:43,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=4030/global_step=4030, RunningAvgSamplesPerSec=2.6288137957759576, CurrSamplesPerSec=2.635227926611749, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:56:58,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=0, lr=[9.97378826709322e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:56:58,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=4040/global_step=4040, RunningAvgSamplesPerSec=2.6288112971637907, CurrSamplesPerSec=2.5867065155570548, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:57:14,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=0, lr=[9.973615689925772e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:57:14,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=4050/global_step=4050, RunningAvgSamplesPerSec=2.628791917674708, CurrSamplesPerSec=2.639361597516368, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:57:29,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=0, lr=[9.973442548002661e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:57:29,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=4060/global_step=4060, RunningAvgSamplesPerSec=2.6287989493196124, CurrSamplesPerSec=2.652842528382787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:57:44,773] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=0, lr=[9.973268841343541e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:57:44,784] [INFO] [timer.py:259:stop] epoch=0/micro_step=4070/global_step=4070, RunningAvgSamplesPerSec=2.6287864761620456, CurrSamplesPerSec=2.638012823282701, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:58:00,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=0, lr=[9.97309456996814e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:58:00,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=4080/global_step=4080, RunningAvgSamplesPerSec=2.6287974060662482, CurrSamplesPerSec=2.638882105953232, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:58:15,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=0, lr=[9.972919733896245e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:58:15,330] [INFO] [timer.py:259:stop] epoch=0/micro_step=4090/global_step=4090, RunningAvgSamplesPerSec=2.628796112076218, CurrSamplesPerSec=2.6395451373610683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:58:30,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=0, lr=[9.97274433314771e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:58:30,649] [INFO] [timer.py:259:stop] epoch=0/micro_step=4100/global_step=4100, RunningAvgSamplesPerSec=2.628782219349233, CurrSamplesPerSec=2.6430589026465197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:58:45,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=0, lr=[9.972568367742452e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:58:45,950] [INFO] [timer.py:259:stop] epoch=0/micro_step=4110/global_step=4110, RunningAvgSamplesPerSec=2.6287798803436218, CurrSamplesPerSec=2.6294634814709963, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:59:01,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=0, lr=[9.97239183770045e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:59:01,273] [INFO] [timer.py:259:stop] epoch=0/micro_step=4120/global_step=4120, RunningAvgSamplesPerSec=2.628765490040276, CurrSamplesPerSec=2.64643383318758, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:59:16,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=0, lr=[9.972214743041751e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:59:16,524] [INFO] [timer.py:259:stop] epoch=0/micro_step=4130/global_step=4130, RunningAvgSamplesPerSec=2.628779006567829, CurrSamplesPerSec=2.6382160886789108, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:59:31,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=4140, skipped=0, lr=[9.972037083786463e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:59:31,860] [INFO] [timer.py:259:stop] epoch=0/micro_step=4140/global_step=4140, RunningAvgSamplesPerSec=2.628759884893128, CurrSamplesPerSec=2.636912415559198, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 01:59:47,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=4150, skipped=0, lr=[9.971858859954759e-06], mom=[(0.9, 0.95)] +[2024-11-01 01:59:47,131] [INFO] [timer.py:259:stop] epoch=0/micro_step=4150/global_step=4150, RunningAvgSamplesPerSec=2.6287639800233222, CurrSamplesPerSec=2.634491768533218, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:00:02,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=4160, skipped=0, lr=[9.971680071566876e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:00:02,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=4160/global_step=4160, RunningAvgSamplesPerSec=2.628767106371065, CurrSamplesPerSec=2.645215863842424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:00:17,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=4170, skipped=0, lr=[9.97150071864312e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:00:17,760] [INFO] [timer.py:259:stop] epoch=0/micro_step=4170/global_step=4170, RunningAvgSamplesPerSec=2.6287456374350806, CurrSamplesPerSec=2.5961039745964523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:00:33,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=4180, skipped=0, lr=[9.97132080120385e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:00:33,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=4180/global_step=4180, RunningAvgSamplesPerSec=2.6287437896070935, CurrSamplesPerSec=2.6174858809242574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:00:48,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=4190, skipped=0, lr=[9.9711403192695e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:00:48,344] [INFO] [timer.py:259:stop] epoch=0/micro_step=4190/global_step=4190, RunningAvgSamplesPerSec=2.6287467840681793, CurrSamplesPerSec=2.6384056930058097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:01:03,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=0, lr=[9.97095927286056e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:01:03,616] [INFO] [timer.py:259:stop] epoch=0/micro_step=4200/global_step=4200, RunningAvgSamplesPerSec=2.6287565338416505, CurrSamplesPerSec=2.6296881012477393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:01:19,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=4210, skipped=0, lr=[9.970777661997592e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:01:19,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=4210/global_step=4210, RunningAvgSamplesPerSec=2.628713989686257, CurrSamplesPerSec=2.573008267667669, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:01:34,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=4220, skipped=0, lr=[9.970595486701219e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:01:34,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=4220/global_step=4220, RunningAvgSamplesPerSec=2.628720535466186, CurrSamplesPerSec=2.6386397280431644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:01:49,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=4230, skipped=0, lr=[9.97041274699212e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:01:49,667] [INFO] [timer.py:259:stop] epoch=0/micro_step=4230/global_step=4230, RunningAvgSamplesPerSec=2.6287010438530674, CurrSamplesPerSec=2.61763739320905, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:02:04,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=4240, skipped=0, lr=[9.970229442891052e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:02:04,922] [INFO] [timer.py:259:stop] epoch=0/micro_step=4240/global_step=4240, RunningAvgSamplesPerSec=2.6287143595106808, CurrSamplesPerSec=2.633646459876763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:02:20,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=4250, skipped=0, lr=[9.970045574418828e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:02:20,249] [INFO] [timer.py:259:stop] epoch=0/micro_step=4250/global_step=4250, RunningAvgSamplesPerSec=2.628700105037646, CurrSamplesPerSec=2.5658629301179636, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:02:35,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=4260, skipped=0, lr=[9.969861141596322e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:02:35,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=4260/global_step=4260, RunningAvgSamplesPerSec=2.6287063539251747, CurrSamplesPerSec=2.622638548150653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:02:50,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=4270, skipped=0, lr=[9.969676144444483e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:02:50,825] [INFO] [timer.py:259:stop] epoch=0/micro_step=4270/global_step=4270, RunningAvgSamplesPerSec=2.628701243259531, CurrSamplesPerSec=2.635735904424415, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:03:06,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=4280, skipped=0, lr=[9.969490582984312e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:03:06,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=4280/global_step=4280, RunningAvgSamplesPerSec=2.6287031021972607, CurrSamplesPerSec=2.599193263496646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:03:21,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=4290, skipped=0, lr=[9.96930445723688e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:03:21,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=4290/global_step=4290, RunningAvgSamplesPerSec=2.6286838519028337, CurrSamplesPerSec=2.6335612974103633, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:03:36,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=0, lr=[9.969117767223325e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:03:36,730] [INFO] [timer.py:259:stop] epoch=0/micro_step=4300/global_step=4300, RunningAvgSamplesPerSec=2.6286842834619857, CurrSamplesPerSec=2.638111963103748, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:03:52,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=4310, skipped=0, lr=[9.968930512964844e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:03:52,041] [INFO] [timer.py:259:stop] epoch=0/micro_step=4310/global_step=4310, RunningAvgSamplesPerSec=2.6286777724826718, CurrSamplesPerSec=2.638892067641832, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:04:07,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=4320, skipped=0, lr=[9.9687426944827e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:04:07,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=4320/global_step=4320, RunningAvgSamplesPerSec=2.628672647082944, CurrSamplesPerSec=2.6299741863888157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:04:22,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=4330, skipped=0, lr=[9.968554311798219e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:04:22,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=4330/global_step=4330, RunningAvgSamplesPerSec=2.62866996184223, CurrSamplesPerSec=2.65473947812475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:04:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=4340, skipped=0, lr=[9.968365364932792e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:04:37,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=4340/global_step=4340, RunningAvgSamplesPerSec=2.6286540776751095, CurrSamplesPerSec=2.623677014949944, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:04:53,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=4350, skipped=0, lr=[9.968175853907875e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:04:53,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=4350/global_step=4350, RunningAvgSamplesPerSec=2.62864470420066, CurrSamplesPerSec=2.6404619729173446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:05:08,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=4360, skipped=0, lr=[9.967985778744986e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:05:08,515] [INFO] [timer.py:259:stop] epoch=0/micro_step=4360/global_step=4360, RunningAvgSamplesPerSec=2.628659559590028, CurrSamplesPerSec=2.595860554646118, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:05:23,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=4370, skipped=0, lr=[9.96779513946571e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:05:23,802] [INFO] [timer.py:259:stop] epoch=0/micro_step=4370/global_step=4370, RunningAvgSamplesPerSec=2.628662558048507, CurrSamplesPerSec=2.621910225545513, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:05:39,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=4380, skipped=0, lr=[9.967603936091691e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:05:39,144] [INFO] [timer.py:259:stop] epoch=0/micro_step=4380/global_step=4380, RunningAvgSamplesPerSec=2.6286413631796464, CurrSamplesPerSec=2.586161048000451, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:05:54,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=4390, skipped=0, lr=[9.967412168644643e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:05:54,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=4390/global_step=4390, RunningAvgSamplesPerSec=2.628630438324935, CurrSamplesPerSec=2.6366878031191026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:06:09,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=0, lr=[9.96721983714634e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:06:09,770] [INFO] [timer.py:259:stop] epoch=0/micro_step=4400/global_step=4400, RunningAvgSamplesPerSec=2.6286274635484035, CurrSamplesPerSec=2.5929446747455622, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:06:25,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=4410, skipped=0, lr=[9.967026941618624e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:06:25,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=4410/global_step=4410, RunningAvgSamplesPerSec=2.6286356347450712, CurrSamplesPerSec=2.651082333858597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:06:40,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=4420, skipped=0, lr=[9.96683348208339e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:06:40,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=4420/global_step=4420, RunningAvgSamplesPerSec=2.6286400064787574, CurrSamplesPerSec=2.6390597673561342, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:06:55,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=4430, skipped=0, lr=[9.966639458562616e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:06:55,661] [INFO] [timer.py:259:stop] epoch=0/micro_step=4430/global_step=4430, RunningAvgSamplesPerSec=2.6286171446518076, CurrSamplesPerSec=2.6377772405480338, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:07:10,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=4440, skipped=0, lr=[9.966444871078327e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:07:10,973] [INFO] [timer.py:259:stop] epoch=0/micro_step=4440/global_step=4440, RunningAvgSamplesPerSec=2.628612702436924, CurrSamplesPerSec=2.63770798390995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:07:26,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=4450, skipped=0, lr=[9.966249719652621e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:07:26,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=4450/global_step=4450, RunningAvgSamplesPerSec=2.6286067370037114, CurrSamplesPerSec=2.6406706034992236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:07:41,564] [INFO] [logging.py:96:log_dist] [Rank 0] step=4460, skipped=0, lr=[9.966054004307656e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:07:41,565] [INFO] [timer.py:259:stop] epoch=0/micro_step=4460/global_step=4460, RunningAvgSamplesPerSec=2.6286102946601027, CurrSamplesPerSec=2.638319392690169, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:07:56,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=4470, skipped=0, lr=[9.965857725065656e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:07:56,838] [INFO] [timer.py:259:stop] epoch=0/micro_step=4470/global_step=4470, RunningAvgSamplesPerSec=2.62861397743116, CurrSamplesPerSec=2.635377774193195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:08:12,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=4480, skipped=0, lr=[9.965660881948908e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:08:12,110] [INFO] [timer.py:259:stop] epoch=0/micro_step=4480/global_step=4480, RunningAvgSamplesPerSec=2.6286197208386177, CurrSamplesPerSec=2.6420657842837847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:08:27,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=4490, skipped=0, lr=[9.965463474979766e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:08:27,383] [INFO] [timer.py:259:stop] epoch=0/micro_step=4490/global_step=4490, RunningAvgSamplesPerSec=2.628627883708219, CurrSamplesPerSec=2.6460777981131027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:08:42,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=0, lr=[9.965265504180643e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:08:42,667] [INFO] [timer.py:259:stop] epoch=0/micro_step=4500/global_step=4500, RunningAvgSamplesPerSec=2.6286283530055568, CurrSamplesPerSec=2.625584240912528, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:08:57,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=4510, skipped=0, lr=[9.965066969574021e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:08:57,953] [INFO] [timer.py:259:stop] epoch=0/micro_step=4510/global_step=4510, RunningAvgSamplesPerSec=2.6286348179998464, CurrSamplesPerSec=2.6134085893856596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:09:13,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=4520, skipped=0, lr=[9.964867871182443e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:09:13,230] [INFO] [timer.py:259:stop] epoch=0/micro_step=4520/global_step=4520, RunningAvgSamplesPerSec=2.6286402470162122, CurrSamplesPerSec=2.618765914985762, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:09:28,538] [INFO] [logging.py:96:log_dist] [Rank 0] step=4530, skipped=0, lr=[9.964668209028513e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:09:28,540] [INFO] [timer.py:259:stop] epoch=0/micro_step=4530/global_step=4530, RunningAvgSamplesPerSec=2.628632462962331, CurrSamplesPerSec=2.624201892821425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:09:43,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=4540, skipped=0, lr=[9.964467983134907e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:09:43,804] [INFO] [timer.py:259:stop] epoch=0/micro_step=4540/global_step=4540, RunningAvgSamplesPerSec=2.6286382693832913, CurrSamplesPerSec=2.6392739890797263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:09:59,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=4550, skipped=0, lr=[9.96426719352436e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:09:59,141] [INFO] [timer.py:259:stop] epoch=0/micro_step=4550/global_step=4550, RunningAvgSamplesPerSec=2.6286249843301848, CurrSamplesPerSec=2.605732241223345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:10:14,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=4560, skipped=0, lr=[9.96406584021967e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:10:14,425] [INFO] [timer.py:259:stop] epoch=0/micro_step=4560/global_step=4560, RunningAvgSamplesPerSec=2.6286257425014714, CurrSamplesPerSec=2.6296699653830045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:10:29,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=4570, skipped=0, lr=[9.963863923243702e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:10:29,724] [INFO] [timer.py:259:stop] epoch=0/micro_step=4570/global_step=4570, RunningAvgSamplesPerSec=2.6286194409447177, CurrSamplesPerSec=2.6249590059736283, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:10:45,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=4580, skipped=0, lr=[9.963661442619383e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:10:45,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=4580/global_step=4580, RunningAvgSamplesPerSec=2.6286183274310817, CurrSamplesPerSec=2.631706866514369, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:11:00,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=4590, skipped=0, lr=[9.963458398369707e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:11:00,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=4590/global_step=4590, RunningAvgSamplesPerSec=2.628625164514872, CurrSamplesPerSec=2.645696825666718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:11:15,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=0, lr=[9.963254790517727e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:11:15,571] [INFO] [timer.py:259:stop] epoch=0/micro_step=4600/global_step=4600, RunningAvgSamplesPerSec=2.62862541822001, CurrSamplesPerSec=2.632591825854315, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:11:30,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=4610, skipped=0, lr=[9.963050619086565e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:11:30,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=4610/global_step=4610, RunningAvgSamplesPerSec=2.6286218860767168, CurrSamplesPerSec=2.635973193425481, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:11:46,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=4620, skipped=0, lr=[9.962845884099404e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:11:46,181] [INFO] [timer.py:259:stop] epoch=0/micro_step=4620/global_step=4620, RunningAvgSamplesPerSec=2.628619448798748, CurrSamplesPerSec=2.6233861449502816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:12:01,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=4630, skipped=0, lr=[9.96264058557949e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:12:01,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=4630/global_step=4630, RunningAvgSamplesPerSec=2.6286199544609774, CurrSamplesPerSec=2.635292913663504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:12:16,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=4640, skipped=0, lr=[9.962434723550136e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:12:16,721] [INFO] [timer.py:259:stop] epoch=0/micro_step=4640/global_step=4640, RunningAvgSamplesPerSec=2.6286363475353287, CurrSamplesPerSec=2.6329413483884156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:12:32,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=4650, skipped=0, lr=[9.962228298034717e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:12:32,016] [INFO] [timer.py:259:stop] epoch=0/micro_step=4650/global_step=4650, RunningAvgSamplesPerSec=2.6286349659454773, CurrSamplesPerSec=2.6490383461069733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:12:47,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=4660, skipped=0, lr=[9.962021309056676e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:12:47,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=4660/global_step=4660, RunningAvgSamplesPerSec=2.628643187922162, CurrSamplesPerSec=2.643849852041082, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:13:02,550] [INFO] [logging.py:96:log_dist] [Rank 0] step=4670, skipped=0, lr=[9.961813756639511e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:13:02,551] [INFO] [timer.py:259:stop] epoch=0/micro_step=4670/global_step=4670, RunningAvgSamplesPerSec=2.6286508350214657, CurrSamplesPerSec=2.6364130986003618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:13:17,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=4680, skipped=0, lr=[9.961605640806794e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:13:17,834] [INFO] [timer.py:259:stop] epoch=0/micro_step=4680/global_step=4680, RunningAvgSamplesPerSec=2.628652694112539, CurrSamplesPerSec=2.6129047041495648, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:13:33,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=4690, skipped=0, lr=[9.961396961582154e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:13:33,092] [INFO] [timer.py:259:stop] epoch=0/micro_step=4690/global_step=4690, RunningAvgSamplesPerSec=2.6286621659813307, CurrSamplesPerSec=2.62378533845458, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:13:48,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=4700, skipped=0, lr=[9.96118771898929e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:13:48,389] [INFO] [timer.py:259:stop] epoch=0/micro_step=4700/global_step=4700, RunningAvgSamplesPerSec=2.6286576070395413, CurrSamplesPerSec=2.5919632245312685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:14:03,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=4710, skipped=0, lr=[9.96097791305196e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:14:03,626] [INFO] [timer.py:259:stop] epoch=0/micro_step=4710/global_step=4710, RunningAvgSamplesPerSec=2.6286751998289395, CurrSamplesPerSec=2.643424121667499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:14:18,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=4720, skipped=0, lr=[9.960767543793984e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:14:18,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=4720/global_step=4720, RunningAvgSamplesPerSec=2.6286733242306783, CurrSamplesPerSec=2.619095011981361, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:14:34,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=4730, skipped=0, lr=[9.960556611239255e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:14:34,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=4730/global_step=4730, RunningAvgSamplesPerSec=2.628685330273553, CurrSamplesPerSec=2.639680940055363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:14:49,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=4740, skipped=0, lr=[9.96034511541172e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:14:49,528] [INFO] [timer.py:259:stop] epoch=0/micro_step=4740/global_step=4740, RunningAvgSamplesPerSec=2.62867092304699, CurrSamplesPerSec=2.625377576442955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:15:04,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=4750, skipped=0, lr=[9.960133056335399e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:15:04,812] [INFO] [timer.py:259:stop] epoch=0/micro_step=4750/global_step=4750, RunningAvgSamplesPerSec=2.6286754359714863, CurrSamplesPerSec=2.6349092467096793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:15:20,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=4760, skipped=0, lr=[9.959920434034367e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:15:20,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=4760/global_step=4760, RunningAvgSamplesPerSec=2.6286864064099245, CurrSamplesPerSec=2.6158811730890337, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:15:35,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=4770, skipped=0, lr=[9.959707248532768e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:15:35,377] [INFO] [timer.py:259:stop] epoch=0/micro_step=4770/global_step=4770, RunningAvgSamplesPerSec=2.628683856553896, CurrSamplesPerSec=2.6274098881370374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:15:50,707] [INFO] [logging.py:96:log_dist] [Rank 0] step=4780, skipped=0, lr=[9.959493499854812e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:15:50,708] [INFO] [timer.py:259:stop] epoch=0/micro_step=4780/global_step=4780, RunningAvgSamplesPerSec=2.6286680126533137, CurrSamplesPerSec=2.6311723800791085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:16:05,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=4790, skipped=0, lr=[9.959279188024769e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:16:05,957] [INFO] [timer.py:259:stop] epoch=0/micro_step=4790/global_step=4790, RunningAvgSamplesPerSec=2.6286840784671806, CurrSamplesPerSec=2.637496918745135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:16:21,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=4800, skipped=0, lr=[9.959064313066973e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:16:21,315] [INFO] [timer.py:259:stop] epoch=0/micro_step=4800/global_step=4800, RunningAvgSamplesPerSec=2.62865799153279, CurrSamplesPerSec=2.549428912865837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:16:36,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=4810, skipped=0, lr=[9.958848875005824e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:16:36,568] [INFO] [timer.py:259:stop] epoch=0/micro_step=4810/global_step=4810, RunningAvgSamplesPerSec=2.6286676922108976, CurrSamplesPerSec=2.64751630048348, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:16:51,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=4820, skipped=0, lr=[9.958632873865785e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:16:51,832] [INFO] [timer.py:259:stop] epoch=0/micro_step=4820/global_step=4820, RunningAvgSamplesPerSec=2.628674184208855, CurrSamplesPerSec=2.652072181957252, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:17:07,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=4830, skipped=0, lr=[9.958416309671381e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:17:07,163] [INFO] [timer.py:259:stop] epoch=0/micro_step=4830/global_step=4830, RunningAvgSamplesPerSec=2.6286589848526654, CurrSamplesPerSec=2.623155628088992, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:17:22,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=4840, skipped=0, lr=[9.958199182447208e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:17:22,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=4840/global_step=4840, RunningAvgSamplesPerSec=2.628666696315772, CurrSamplesPerSec=2.6315314319715304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:17:37,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=4850, skipped=0, lr=[9.957981492217917e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:17:37,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=4850/global_step=4850, RunningAvgSamplesPerSec=2.62869060268066, CurrSamplesPerSec=2.619515395254562, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:17:53,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=4860, skipped=0, lr=[9.957763239008227e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:17:53,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=4860/global_step=4860, RunningAvgSamplesPerSec=2.628659432387042, CurrSamplesPerSec=2.624918347261092, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:18:08,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=4870, skipped=0, lr=[9.957544422842921e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:18:08,355] [INFO] [timer.py:259:stop] epoch=0/micro_step=4870/global_step=4870, RunningAvgSamplesPerSec=2.628656169024766, CurrSamplesPerSec=2.6087455205257153, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:18:23,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=4880, skipped=0, lr=[9.957325043746847e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:18:23,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=4880/global_step=4880, RunningAvgSamplesPerSec=2.628665379434373, CurrSamplesPerSec=2.6408838397308854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:18:38,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=4890, skipped=0, lr=[9.957105101744913e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:18:38,918] [INFO] [timer.py:259:stop] epoch=0/micro_step=4890/global_step=4890, RunningAvgSamplesPerSec=2.6286592967466653, CurrSamplesPerSec=2.5959553464273144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:18:54,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=4900, skipped=0, lr=[9.956884596862096e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:18:54,235] [INFO] [timer.py:259:stop] epoch=0/micro_step=4900/global_step=4900, RunningAvgSamplesPerSec=2.628658052146778, CurrSamplesPerSec=2.6302005427988058, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:19:09,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=4910, skipped=0, lr=[9.956663529123431e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:19:09,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=4910/global_step=4910, RunningAvgSamplesPerSec=2.628607760768292, CurrSamplesPerSec=2.652285167131244, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:19:24,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=4920, skipped=0, lr=[9.956441898554025e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:19:24,960] [INFO] [timer.py:259:stop] epoch=0/micro_step=4920/global_step=4920, RunningAvgSamplesPerSec=2.6286118153946814, CurrSamplesPerSec=2.6288211570941433, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:19:40,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=4930, skipped=0, lr=[9.956219705179042e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:19:40,211] [INFO] [timer.py:259:stop] epoch=0/micro_step=4930/global_step=4930, RunningAvgSamplesPerSec=2.628624186456211, CurrSamplesPerSec=2.6313197033116618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:19:55,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=4940, skipped=0, lr=[9.955996949023712e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:19:55,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=4940/global_step=4940, RunningAvgSamplesPerSec=2.62863778995793, CurrSamplesPerSec=2.642839486618951, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:20:10,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=4950, skipped=0, lr=[9.955773630113329e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:20:10,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=4950/global_step=4950, RunningAvgSamplesPerSec=2.628630659128092, CurrSamplesPerSec=2.6220417609177478, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:20:26,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=4960, skipped=0, lr=[9.95554974847325e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:20:26,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=4960/global_step=4960, RunningAvgSamplesPerSec=2.6286488564213175, CurrSamplesPerSec=2.641844869101311, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:20:41,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=4970, skipped=0, lr=[9.955325304128899e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:20:41,310] [INFO] [timer.py:259:stop] epoch=0/micro_step=4970/global_step=4970, RunningAvgSamplesPerSec=2.6286512655637617, CurrSamplesPerSec=2.6260387706706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:20:56,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=4980, skipped=0, lr=[9.95510029710576e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:20:56,538] [INFO] [timer.py:259:stop] epoch=0/micro_step=4980/global_step=4980, RunningAvgSamplesPerSec=2.6286690444945484, CurrSamplesPerSec=2.642722923662465, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:21:11,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=4990, skipped=0, lr=[9.954874727429384e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:21:11,837] [INFO] [timer.py:259:stop] epoch=0/micro_step=4990/global_step=4990, RunningAvgSamplesPerSec=2.628666870826591, CurrSamplesPerSec=2.6300471604291413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:21:27,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=5000, skipped=0, lr=[9.954648595125382e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:21:27,076] [INFO] [timer.py:259:stop] epoch=0/micro_step=5000/global_step=5000, RunningAvgSamplesPerSec=2.6286832869472656, CurrSamplesPerSec=2.6365510652078865, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:21:42,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=5010, skipped=0, lr=[9.954421900219435e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:21:42,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=5010/global_step=5010, RunningAvgSamplesPerSec=2.628683520449172, CurrSamplesPerSec=2.6256643680970635, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:21:57,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=5020, skipped=0, lr=[9.95419464273728e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:21:57,713] [INFO] [timer.py:259:stop] epoch=0/micro_step=5020/global_step=5020, RunningAvgSamplesPerSec=2.6286749939445593, CurrSamplesPerSec=2.607257256138171, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:22:12,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=5030, skipped=0, lr=[9.953966822704726e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:22:12,982] [INFO] [timer.py:259:stop] epoch=0/micro_step=5030/global_step=5030, RunningAvgSamplesPerSec=2.628684396124052, CurrSamplesPerSec=2.6338312731045668, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:22:28,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=5040, skipped=0, lr=[9.953738440147642e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:22:28,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=5040/global_step=5040, RunningAvgSamplesPerSec=2.6286922674249786, CurrSamplesPerSec=2.612442098626878, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:22:43,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=5050, skipped=0, lr=[9.953509495091957e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:22:43,561] [INFO] [timer.py:259:stop] epoch=0/micro_step=5050/global_step=5050, RunningAvgSamplesPerSec=2.6286852968440773, CurrSamplesPerSec=2.617456478955112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:22:58,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=5060, skipped=0, lr=[9.953279987563673e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:22:58,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=5060/global_step=5060, RunningAvgSamplesPerSec=2.6286892843747527, CurrSamplesPerSec=2.6065046492264514, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:23:14,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=5070, skipped=0, lr=[9.953049917588845e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:23:14,100] [INFO] [timer.py:259:stop] epoch=0/micro_step=5070/global_step=5070, RunningAvgSamplesPerSec=2.628696716200856, CurrSamplesPerSec=2.6410501296947846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:23:29,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=5080, skipped=0, lr=[9.952819285193602e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:23:29,455] [INFO] [timer.py:259:stop] epoch=0/micro_step=5080/global_step=5080, RunningAvgSamplesPerSec=2.6286764745144113, CurrSamplesPerSec=2.6419226636423643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:23:44,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=5090, skipped=0, lr=[9.952588090404129e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:23:44,708] [INFO] [timer.py:259:stop] epoch=0/micro_step=5090/global_step=5090, RunningAvgSamplesPerSec=2.6286901686453605, CurrSamplesPerSec=2.6385920047633107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:23:59,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=5100, skipped=0, lr=[9.952356333246683e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:24:00,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=5100/global_step=5100, RunningAvgSamplesPerSec=2.628681681018669, CurrSamplesPerSec=2.636366284384085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:24:15,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=5110, skipped=0, lr=[9.952124013747575e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:24:15,346] [INFO] [timer.py:259:stop] epoch=0/micro_step=5110/global_step=5110, RunningAvgSamplesPerSec=2.628665328405857, CurrSamplesPerSec=2.6383812130157596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:24:30,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=5120, skipped=0, lr=[9.951891131933189e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:24:30,646] [INFO] [timer.py:259:stop] epoch=0/micro_step=5120/global_step=5120, RunningAvgSamplesPerSec=2.628661955563286, CurrSamplesPerSec=2.6558026863782067, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:24:45,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=5130, skipped=0, lr=[9.951657687829965e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:24:45,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=5130/global_step=5130, RunningAvgSamplesPerSec=2.6286561134885753, CurrSamplesPerSec=2.6278452931205987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:25:01,187] [INFO] [logging.py:96:log_dist] [Rank 0] step=5140, skipped=0, lr=[9.951423681464414e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:25:01,207] [INFO] [timer.py:259:stop] epoch=0/micro_step=5140/global_step=5140, RunningAvgSamplesPerSec=2.628671819099495, CurrSamplesPerSec=2.647461153434903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:25:16,474] [INFO] [logging.py:96:log_dist] [Rank 0] step=5150, skipped=0, lr=[9.951189112863104e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:25:16,481] [INFO] [timer.py:259:stop] epoch=0/micro_step=5150/global_step=5150, RunningAvgSamplesPerSec=2.6286750822183187, CurrSamplesPerSec=2.6342514372484698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:25:31,770] [INFO] [logging.py:96:log_dist] [Rank 0] step=5160, skipped=0, lr=[9.950953982052674e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:25:31,771] [INFO] [timer.py:259:stop] epoch=0/micro_step=5160/global_step=5160, RunningAvgSamplesPerSec=2.6286814995381476, CurrSamplesPerSec=2.6375690668875875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:25:47,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=5170, skipped=0, lr=[9.950718289059823e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:25:47,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=5170/global_step=5170, RunningAvgSamplesPerSec=2.628691828875264, CurrSamplesPerSec=2.6245684879735305, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:26:02,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=5180, skipped=0, lr=[9.950482033911311e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:26:02,373] [INFO] [timer.py:259:stop] epoch=0/micro_step=5180/global_step=5180, RunningAvgSamplesPerSec=2.6286761282062012, CurrSamplesPerSec=2.6348091060870504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:26:17,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=5190, skipped=0, lr=[9.950245216633967e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:26:17,727] [INFO] [timer.py:259:stop] epoch=0/micro_step=5190/global_step=5190, RunningAvgSamplesPerSec=2.6286617024744756, CurrSamplesPerSec=2.6297561129684746, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:26:33,052] [INFO] [logging.py:96:log_dist] [Rank 0] step=5200, skipped=0, lr=[9.950007837254679e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:26:33,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=5200/global_step=5200, RunningAvgSamplesPerSec=2.6286524464577843, CurrSamplesPerSec=2.6343043808222406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:26:48,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=5210, skipped=0, lr=[9.949769895800404e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:26:48,413] [INFO] [timer.py:259:stop] epoch=0/micro_step=5210/global_step=5210, RunningAvgSamplesPerSec=2.6286292990825286, CurrSamplesPerSec=2.5970439396456992, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:27:03,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=5220, skipped=0, lr=[9.949531392298163e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:27:03,777] [INFO] [timer.py:259:stop] epoch=0/micro_step=5220/global_step=5220, RunningAvgSamplesPerSec=2.6286117552572064, CurrSamplesPerSec=2.6266159954018775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:27:19,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=5230, skipped=0, lr=[9.949292326775033e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:27:19,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=5230/global_step=5230, RunningAvgSamplesPerSec=2.628596955182838, CurrSamplesPerSec=2.609541228255119, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:27:34,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=5240, skipped=0, lr=[9.949052699258163e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:27:34,390] [INFO] [timer.py:259:stop] epoch=0/micro_step=5240/global_step=5240, RunningAvgSamplesPerSec=2.6286069067837095, CurrSamplesPerSec=2.6342774951477357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:27:49,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=5250, skipped=0, lr=[9.948812509774761e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:27:49,728] [INFO] [timer.py:259:stop] epoch=0/micro_step=5250/global_step=5250, RunningAvgSamplesPerSec=2.628600317689881, CurrSamplesPerSec=2.6270124704468367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:28:05,009] [INFO] [logging.py:96:log_dist] [Rank 0] step=5260, skipped=0, lr=[9.948571758352102e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:28:05,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=5260/global_step=5260, RunningAvgSamplesPerSec=2.6286077571405486, CurrSamplesPerSec=2.6443769978860345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:28:20,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=5270, skipped=0, lr=[9.948330445017522e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:28:20,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=5270/global_step=5270, RunningAvgSamplesPerSec=2.6286043653120714, CurrSamplesPerSec=2.6316965461878747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:28:35,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=5280, skipped=0, lr=[9.948088569798424e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:28:35,608] [INFO] [timer.py:259:stop] epoch=0/micro_step=5280/global_step=5280, RunningAvgSamplesPerSec=2.628610421966762, CurrSamplesPerSec=2.631647835339469, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:28:50,881] [INFO] [logging.py:96:log_dist] [Rank 0] step=5290, skipped=0, lr=[9.947846132722273e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:28:50,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=5290/global_step=5290, RunningAvgSamplesPerSec=2.628614200873123, CurrSamplesPerSec=2.6541561260808186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:29:06,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=5300, skipped=0, lr=[9.947603133816595e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:29:06,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=5300/global_step=5300, RunningAvgSamplesPerSec=2.628607873544156, CurrSamplesPerSec=2.634735867558566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:29:21,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=5310, skipped=0, lr=[9.947359573108986e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:29:21,598] [INFO] [timer.py:259:stop] epoch=0/micro_step=5310/global_step=5310, RunningAvgSamplesPerSec=2.628584104976207, CurrSamplesPerSec=2.6345600289589735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:29:36,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=5320, skipped=0, lr=[9.947115450627099e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:29:36,918] [INFO] [timer.py:259:stop] epoch=0/micro_step=5320/global_step=5320, RunningAvgSamplesPerSec=2.6285735587917665, CurrSamplesPerSec=2.592738708892437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:29:52,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=5330, skipped=0, lr=[9.946870766398657e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:29:52,233] [INFO] [timer.py:259:stop] epoch=0/micro_step=5330/global_step=5330, RunningAvgSamplesPerSec=2.628566150739441, CurrSamplesPerSec=2.6476069638821587, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:30:07,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=5340, skipped=0, lr=[9.946625520451443e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:30:07,524] [INFO] [timer.py:259:stop] epoch=0/micro_step=5340/global_step=5340, RunningAvgSamplesPerSec=2.6285645546773764, CurrSamplesPerSec=2.6330297765981765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:30:22,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=5350, skipped=0, lr=[9.946379712813305e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:30:22,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=5350/global_step=5350, RunningAvgSamplesPerSec=2.62855720777206, CurrSamplesPerSec=2.635609202141983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:30:38,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=5360, skipped=0, lr=[9.946133343512155e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:30:38,132] [INFO] [timer.py:259:stop] epoch=0/micro_step=5360/global_step=5360, RunningAvgSamplesPerSec=2.6285587475149903, CurrSamplesPerSec=2.6000903355575256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:30:53,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=5370, skipped=0, lr=[9.945886412575967e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:30:53,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=5370/global_step=5370, RunningAvgSamplesPerSec=2.628555260421133, CurrSamplesPerSec=2.648822954632421, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:31:08,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=5380, skipped=0, lr=[9.94563892003278e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:31:08,779] [INFO] [timer.py:259:stop] epoch=0/micro_step=5380/global_step=5380, RunningAvgSamplesPerSec=2.628538974521648, CurrSamplesPerSec=2.6142193650516723, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:31:24,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=5390, skipped=0, lr=[9.945390865910698e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:31:24,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=5390/global_step=5390, RunningAvgSamplesPerSec=2.628550831659337, CurrSamplesPerSec=2.640319441496339, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:31:39,339] [INFO] [logging.py:96:log_dist] [Rank 0] step=5400, skipped=0, lr=[9.945142250237889e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:31:39,354] [INFO] [timer.py:259:stop] epoch=0/micro_step=5400/global_step=5400, RunningAvgSamplesPerSec=2.6285397816505363, CurrSamplesPerSec=2.632279978068664, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:31:54,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=5410, skipped=0, lr=[9.944893073042581e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:31:54,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=5410/global_step=5410, RunningAvgSamplesPerSec=2.628532675207557, CurrSamplesPerSec=2.6433549846675017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:32:09,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=5420, skipped=0, lr=[9.944643334353067e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:32:09,907] [INFO] [timer.py:259:stop] epoch=0/micro_step=5420/global_step=5420, RunningAvgSamplesPerSec=2.6285451025327724, CurrSamplesPerSec=2.6491713626083366, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:32:25,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=5430, skipped=0, lr=[9.944393034197708e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:32:25,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=5430/global_step=5430, RunningAvgSamplesPerSec=2.6285556286119087, CurrSamplesPerSec=2.6351310728779422, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:32:40,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=5440, skipped=0, lr=[9.944142172604923e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:32:40,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=5440/global_step=5440, RunningAvgSamplesPerSec=2.6285613797988825, CurrSamplesPerSec=2.6299082246393253, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:32:55,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=5450, skipped=0, lr=[9.9438907496032e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:32:55,808] [INFO] [timer.py:259:stop] epoch=0/micro_step=5450/global_step=5450, RunningAvgSamplesPerSec=2.6285438837236614, CurrSamplesPerSec=2.631186822803892, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:33:11,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=5460, skipped=0, lr=[9.943638765221087e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:33:11,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=5460/global_step=5460, RunningAvgSamplesPerSec=2.628543558072842, CurrSamplesPerSec=2.6487066996161257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:33:26,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=5470, skipped=0, lr=[9.943386219487196e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:33:26,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=5470/global_step=5470, RunningAvgSamplesPerSec=2.6285386754781555, CurrSamplesPerSec=2.611979656880534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:33:41,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=5480, skipped=0, lr=[9.943133112430205e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:33:41,748] [INFO] [timer.py:259:stop] epoch=0/micro_step=5480/global_step=5480, RunningAvgSamplesPerSec=2.6285286786268567, CurrSamplesPerSec=2.6347354537931356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:33:57,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=5490, skipped=0, lr=[9.942879444078854e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:33:57,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=5490/global_step=5490, RunningAvgSamplesPerSec=2.6285043467803266, CurrSamplesPerSec=2.624787344421639, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:34:12,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=5500, skipped=0, lr=[9.942625214461948e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:34:12,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=5500/global_step=5500, RunningAvgSamplesPerSec=2.628507127150622, CurrSamplesPerSec=2.632768641258459, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:34:27,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=5510, skipped=0, lr=[9.942370423608352e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:34:27,661] [INFO] [timer.py:259:stop] epoch=0/micro_step=5510/global_step=5510, RunningAvgSamplesPerSec=2.6285111047564382, CurrSamplesPerSec=2.6249988445197068, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:34:42,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=5520, skipped=0, lr=[9.942115071547e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:34:42,923] [INFO] [timer.py:259:stop] epoch=0/micro_step=5520/global_step=5520, RunningAvgSamplesPerSec=2.628519083709194, CurrSamplesPerSec=2.6412014717623955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:34:58,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=5530, skipped=0, lr=[9.941859158306888e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:34:58,214] [INFO] [timer.py:259:stop] epoch=0/micro_step=5530/global_step=5530, RunningAvgSamplesPerSec=2.6285202966775003, CurrSamplesPerSec=2.6039927625597605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:35:13,501] [INFO] [logging.py:96:log_dist] [Rank 0] step=5540, skipped=0, lr=[9.941602683917072e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:35:13,502] [INFO] [timer.py:259:stop] epoch=0/micro_step=5540/global_step=5540, RunningAvgSamplesPerSec=2.6285205687535083, CurrSamplesPerSec=2.6264025900313626, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:35:28,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=5550, skipped=0, lr=[9.941345648406678e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:35:28,874] [INFO] [timer.py:259:stop] epoch=0/micro_step=5550/global_step=5550, RunningAvgSamplesPerSec=2.6284954895232255, CurrSamplesPerSec=2.572334459008709, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:35:44,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=5560, skipped=0, lr=[9.941088051804888e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:35:44,154] [INFO] [timer.py:259:stop] epoch=0/micro_step=5560/global_step=5560, RunningAvgSamplesPerSec=2.6285006099539334, CurrSamplesPerSec=2.585983262540919, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:35:59,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=5570, skipped=0, lr=[9.940829894140958e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:35:59,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=5570/global_step=5570, RunningAvgSamplesPerSec=2.6285154204481924, CurrSamplesPerSec=2.6450515509710044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:36:14,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=5580, skipped=0, lr=[9.9405711754442e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:36:14,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=5580/global_step=5580, RunningAvgSamplesPerSec=2.628528534127673, CurrSamplesPerSec=2.6336154534868363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:36:30,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=5590, skipped=0, lr=[9.940311895743988e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:36:30,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=5590/global_step=5590, RunningAvgSamplesPerSec=2.628503674178809, CurrSamplesPerSec=2.631646184155868, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:36:45,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=5600, skipped=0, lr=[9.940052055069766e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:36:45,407] [INFO] [timer.py:259:stop] epoch=0/micro_step=5600/global_step=5600, RunningAvgSamplesPerSec=2.628478272717033, CurrSamplesPerSec=2.6268240887490286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:37:00,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=5610, skipped=0, lr=[9.939791653451041e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:37:00,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=5610/global_step=5610, RunningAvgSamplesPerSec=2.628470457914662, CurrSamplesPerSec=2.5870443582628666, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:37:16,067] [INFO] [logging.py:96:log_dist] [Rank 0] step=5620, skipped=0, lr=[9.939530690917378e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:37:16,068] [INFO] [timer.py:259:stop] epoch=0/micro_step=5620/global_step=5620, RunningAvgSamplesPerSec=2.628464766984721, CurrSamplesPerSec=2.629959344706694, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:37:31,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=5630, skipped=0, lr=[9.93926916749841e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:37:31,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=5630/global_step=5630, RunningAvgSamplesPerSec=2.6284175783984587, CurrSamplesPerSec=2.6130035935981923, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:37:46,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=5640, skipped=0, lr=[9.939007083223837e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:37:46,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=5640/global_step=5640, RunningAvgSamplesPerSec=2.6284054992204866, CurrSamplesPerSec=2.616100214777442, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:38:02,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=5650, skipped=0, lr=[9.938744438123415e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:38:02,302] [INFO] [timer.py:259:stop] epoch=0/micro_step=5650/global_step=5650, RunningAvgSamplesPerSec=2.6283884830580746, CurrSamplesPerSec=2.6359885172204165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:38:17,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=5660, skipped=0, lr=[9.938481232226967e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:38:17,693] [INFO] [timer.py:259:stop] epoch=0/micro_step=5660/global_step=5660, RunningAvgSamplesPerSec=2.6283681941809705, CurrSamplesPerSec=2.61920663757132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:38:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=5670, skipped=0, lr=[9.93821746556438e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:38:33,041] [INFO] [timer.py:259:stop] epoch=0/micro_step=5670/global_step=5670, RunningAvgSamplesPerSec=2.628360699233549, CurrSamplesPerSec=2.6326955161201266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:38:48,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=5680, skipped=0, lr=[9.93795313816561e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:38:48,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=5680/global_step=5680, RunningAvgSamplesPerSec=2.6283391652241908, CurrSamplesPerSec=2.583166674711079, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:39:03,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=5690, skipped=0, lr=[9.937688250060667e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:39:03,816] [INFO] [timer.py:259:stop] epoch=0/micro_step=5690/global_step=5690, RunningAvgSamplesPerSec=2.6283219194241583, CurrSamplesPerSec=2.6443815826848556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:39:19,029] [INFO] [logging.py:96:log_dist] [Rank 0] step=5700, skipped=0, lr=[9.937422801279628e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:39:19,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=5700/global_step=5700, RunningAvgSamplesPerSec=2.6283399258698297, CurrSamplesPerSec=2.6063511837292483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:39:34,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=5710, skipped=0, lr=[9.937156791852635e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:39:34,379] [INFO] [timer.py:259:stop] epoch=0/micro_step=5710/global_step=5710, RunningAvgSamplesPerSec=2.628328590425952, CurrSamplesPerSec=2.637830325942307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:39:49,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=5720, skipped=0, lr=[9.936890221809898e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:39:49,717] [INFO] [timer.py:259:stop] epoch=0/micro_step=5720/global_step=5720, RunningAvgSamplesPerSec=2.6283138895331253, CurrSamplesPerSec=2.628443490018469, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:40:04,980] [INFO] [logging.py:96:log_dist] [Rank 0] step=5730, skipped=0, lr=[9.936623091181683e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:40:04,999] [INFO] [timer.py:259:stop] epoch=0/micro_step=5730/global_step=5730, RunningAvgSamplesPerSec=2.6283160976163344, CurrSamplesPerSec=2.625146709490181, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:40:20,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=5740, skipped=0, lr=[9.936355399998321e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:40:20,319] [INFO] [timer.py:259:stop] epoch=0/micro_step=5740/global_step=5740, RunningAvgSamplesPerSec=2.6283103428447196, CurrSamplesPerSec=2.6444153439657403, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:40:35,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=5750, skipped=0, lr=[9.936087148290213e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:40:35,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=5750/global_step=5750, RunningAvgSamplesPerSec=2.6283068905027602, CurrSamplesPerSec=2.612650800097718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:40:50,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=5760, skipped=0, lr=[9.935818336087813e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:40:50,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=5760/global_step=5760, RunningAvgSamplesPerSec=2.6283012826600167, CurrSamplesPerSec=2.6101477689704056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:41:06,168] [INFO] [logging.py:96:log_dist] [Rank 0] step=5770, skipped=0, lr=[9.93554896342165e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:41:06,170] [INFO] [timer.py:259:stop] epoch=0/micro_step=5770/global_step=5770, RunningAvgSamplesPerSec=2.62831066596353, CurrSamplesPerSec=2.636952617763807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:41:21,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=5780, skipped=0, lr=[9.93527903032231e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:41:21,505] [INFO] [timer.py:259:stop] epoch=0/micro_step=5780/global_step=5780, RunningAvgSamplesPerSec=2.6282984883010516, CurrSamplesPerSec=2.545441781093071, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:41:36,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=5790, skipped=0, lr=[9.935008536820441e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:41:36,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=5790/global_step=5790, RunningAvgSamplesPerSec=2.6282903969875013, CurrSamplesPerSec=2.6075956252474155, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:41:52,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=5800, skipped=0, lr=[9.934737482946763e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:41:52,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=5800/global_step=5800, RunningAvgSamplesPerSec=2.6282873394430766, CurrSamplesPerSec=2.634827726701098, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:42:07,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=5810, skipped=0, lr=[9.93446586873205e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:42:07,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=5810/global_step=5810, RunningAvgSamplesPerSec=2.6282756775771894, CurrSamplesPerSec=2.5643694787142266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:42:22,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=5820, skipped=0, lr=[9.934193694207144e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:42:22,802] [INFO] [timer.py:259:stop] epoch=0/micro_step=5820/global_step=5820, RunningAvgSamplesPerSec=2.6282679675358196, CurrSamplesPerSec=2.626940898671699, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:42:38,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=5830, skipped=0, lr=[9.933920959402953e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:42:38,114] [INFO] [timer.py:259:stop] epoch=0/micro_step=5830/global_step=5830, RunningAvgSamplesPerSec=2.628261622982385, CurrSamplesPerSec=2.5986505677045026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:42:53,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=5840, skipped=0, lr=[9.933647664350446e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:42:53,432] [INFO] [timer.py:259:stop] epoch=0/micro_step=5840/global_step=5840, RunningAvgSamplesPerSec=2.628254811306557, CurrSamplesPerSec=2.5850305737098234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:43:08,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=5850, skipped=0, lr=[9.933373809080652e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:43:08,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=5850/global_step=5850, RunningAvgSamplesPerSec=2.628254857715935, CurrSamplesPerSec=2.601185626572595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:43:24,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=5860, skipped=0, lr=[9.933099393624671e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:43:24,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=5860/global_step=5860, RunningAvgSamplesPerSec=2.6282355413695404, CurrSamplesPerSec=2.5940315460565406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:43:39,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=5870, skipped=0, lr=[9.932824418013663e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:43:39,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=5870/global_step=5870, RunningAvgSamplesPerSec=2.628243176726216, CurrSamplesPerSec=2.6110239749318147, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:43:54,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=5880, skipped=0, lr=[9.932548882278849e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:43:54,696] [INFO] [timer.py:259:stop] epoch=0/micro_step=5880/global_step=5880, RunningAvgSamplesPerSec=2.6282346048213308, CurrSamplesPerSec=2.627494653220778, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:44:10,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=5890, skipped=0, lr=[9.932272786451518e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:44:10,030] [INFO] [timer.py:259:stop] epoch=0/micro_step=5890/global_step=5890, RunningAvgSamplesPerSec=2.6282227392165343, CurrSamplesPerSec=2.6475225673389158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:44:25,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=5900, skipped=0, lr=[9.93199613056302e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:44:25,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=5900/global_step=5900, RunningAvgSamplesPerSec=2.6282155172114026, CurrSamplesPerSec=2.6426251019741187, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:44:40,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=5910, skipped=0, lr=[9.93171891464477e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:44:40,636] [INFO] [timer.py:259:stop] epoch=0/micro_step=5910/global_step=5910, RunningAvgSamplesPerSec=2.628209138600167, CurrSamplesPerSec=2.6346080202243845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:44:56,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=5920, skipped=0, lr=[9.931441138728245e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:44:56,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=5920/global_step=5920, RunningAvgSamplesPerSec=2.628182928703879, CurrSamplesPerSec=2.6207065320917597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:45:11,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=5930, skipped=0, lr=[9.931162802844988e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:45:11,352] [INFO] [timer.py:259:stop] epoch=0/micro_step=5930/global_step=5930, RunningAvgSamplesPerSec=2.628175830552827, CurrSamplesPerSec=2.6363654558288583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:45:26,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=5940, skipped=0, lr=[9.930883907026606e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:45:26,611] [INFO] [timer.py:259:stop] epoch=0/micro_step=5940/global_step=5940, RunningAvgSamplesPerSec=2.6281866099106215, CurrSamplesPerSec=2.630759385252402, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:45:41,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=5950, skipped=0, lr=[9.930604451304762e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:45:41,978] [INFO] [timer.py:259:stop] epoch=0/micro_step=5950/global_step=5950, RunningAvgSamplesPerSec=2.628169375034674, CurrSamplesPerSec=2.5809218679309476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:45:57,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=5960, skipped=0, lr=[9.930324435711191e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:45:57,294] [INFO] [timer.py:259:stop] epoch=0/micro_step=5960/global_step=5960, RunningAvgSamplesPerSec=2.628167295797275, CurrSamplesPerSec=2.6425010661617687, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:46:12,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=5970, skipped=0, lr=[9.93004386027769e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:46:12,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=5970/global_step=5970, RunningAvgSamplesPerSec=2.6281616385764046, CurrSamplesPerSec=2.63161852713856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:46:27,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=5980, skipped=0, lr=[9.929762725036119e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:46:27,958] [INFO] [timer.py:259:stop] epoch=0/micro_step=5980/global_step=5980, RunningAvgSamplesPerSec=2.6281489160031186, CurrSamplesPerSec=2.581341206002611, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:46:43,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=5990, skipped=0, lr=[9.929481030018397e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:46:43,266] [INFO] [timer.py:259:stop] epoch=0/micro_step=5990/global_step=5990, RunningAvgSamplesPerSec=2.628146104599578, CurrSamplesPerSec=2.6207982343648704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:46:58,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=6000, skipped=0, lr=[9.929198775256516e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:46:58,596] [INFO] [timer.py:259:stop] epoch=0/micro_step=6000/global_step=6000, RunningAvgSamplesPerSec=2.6281384868644397, CurrSamplesPerSec=2.581723732542697, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:47:13,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=6010, skipped=0, lr=[9.928915960782521e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:47:13,932] [INFO] [timer.py:259:stop] epoch=0/micro_step=6010/global_step=6010, RunningAvgSamplesPerSec=2.628133607683038, CurrSamplesPerSec=2.631924024952935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:47:29,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=6020, skipped=0, lr=[9.92863258662853e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:47:29,304] [INFO] [timer.py:259:stop] epoch=0/micro_step=6020/global_step=6020, RunningAvgSamplesPerSec=2.6281156707798217, CurrSamplesPerSec=2.6007626389536775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:47:44,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=6030, skipped=0, lr=[9.928348652826718e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:47:44,669] [INFO] [timer.py:259:stop] epoch=0/micro_step=6030/global_step=6030, RunningAvgSamplesPerSec=2.62810088440141, CurrSamplesPerSec=2.620311548786645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:48:00,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=6040, skipped=0, lr=[9.928064159409324e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:48:00,061] [INFO] [timer.py:259:stop] epoch=0/micro_step=6040/global_step=6040, RunningAvgSamplesPerSec=2.62807977289863, CurrSamplesPerSec=2.6282643729166955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:48:15,391] [INFO] [logging.py:96:log_dist] [Rank 0] step=6050, skipped=0, lr=[9.927779106408657e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:48:15,410] [INFO] [timer.py:259:stop] epoch=0/micro_step=6050/global_step=6050, RunningAvgSamplesPerSec=2.6280731987158563, CurrSamplesPerSec=2.6263355739202128, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:48:30,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=6060, skipped=0, lr=[9.927493493857082e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:48:30,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=6060/global_step=6060, RunningAvgSamplesPerSec=2.628070680376889, CurrSamplesPerSec=2.615712327998759, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:48:46,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=6070, skipped=0, lr=[9.927207321787028e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:48:46,029] [INFO] [timer.py:259:stop] epoch=0/micro_step=6070/global_step=6070, RunningAvgSamplesPerSec=2.6280643478464065, CurrSamplesPerSec=2.637624217267368, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:49:01,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=6080, skipped=0, lr=[9.926920590230995e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:49:01,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=6080/global_step=6080, RunningAvgSamplesPerSec=2.628058259697095, CurrSamplesPerSec=2.637016446462378, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:49:16,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=6090, skipped=0, lr=[9.926633299221537e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:49:16,657] [INFO] [timer.py:259:stop] epoch=0/micro_step=6090/global_step=6090, RunningAvgSamplesPerSec=2.6280589591792016, CurrSamplesPerSec=2.626162499051204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:49:31,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=6100, skipped=0, lr=[9.92634544879128e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:49:31,944] [INFO] [timer.py:259:stop] epoch=0/micro_step=6100/global_step=6100, RunningAvgSamplesPerSec=2.6280609477521435, CurrSamplesPerSec=2.6341095755070008, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:49:47,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=6110, skipped=0, lr=[9.926057038972908e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:49:47,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=6110/global_step=6110, RunningAvgSamplesPerSec=2.6280456485238357, CurrSamplesPerSec=2.63269675549541, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:50:02,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=6120, skipped=0, lr=[9.925768069799167e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:50:02,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=6120/global_step=6120, RunningAvgSamplesPerSec=2.628037078885824, CurrSamplesPerSec=2.642513552461426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:50:17,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=6130, skipped=0, lr=[9.925478541302873e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:50:17,999] [INFO] [timer.py:259:stop] epoch=0/micro_step=6130/global_step=6130, RunningAvgSamplesPerSec=2.628015471316548, CurrSamplesPerSec=2.529638168304523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:50:33,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=6140, skipped=0, lr=[9.9251884535169e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:50:33,286] [INFO] [timer.py:259:stop] epoch=0/micro_step=6140/global_step=6140, RunningAvgSamplesPerSec=2.628016014874733, CurrSamplesPerSec=2.635962839610843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:50:48,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=6150, skipped=0, lr=[9.924897806474191e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:50:48,635] [INFO] [timer.py:259:stop] epoch=0/micro_step=6150/global_step=6150, RunningAvgSamplesPerSec=2.6280013287642148, CurrSamplesPerSec=2.577226394876429, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:51:03,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=6160, skipped=0, lr=[9.924606600207746e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:51:03,903] [INFO] [timer.py:259:stop] epoch=0/micro_step=6160/global_step=6160, RunningAvgSamplesPerSec=2.628008328839023, CurrSamplesPerSec=2.62588464032637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:51:19,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=6170, skipped=0, lr=[9.924314834750633e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:51:19,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=6170/global_step=6170, RunningAvgSamplesPerSec=2.6280053339445884, CurrSamplesPerSec=2.6325909996698575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:51:34,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=6180, skipped=0, lr=[9.92402251013598e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:51:34,543] [INFO] [timer.py:259:stop] epoch=0/micro_step=6180/global_step=6180, RunningAvgSamplesPerSec=2.628001354068234, CurrSamplesPerSec=2.6421315251240656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:51:49,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=6190, skipped=0, lr=[9.923729626396982e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:51:49,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=6190/global_step=6190, RunningAvgSamplesPerSec=2.62800555965693, CurrSamplesPerSec=2.6455253612796126, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:52:05,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=6200, skipped=0, lr=[9.923436183566896e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:52:05,052] [INFO] [timer.py:259:stop] epoch=0/micro_step=6200/global_step=6200, RunningAvgSamplesPerSec=2.628019513701156, CurrSamplesPerSec=2.6333083229552035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:52:20,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=6210, skipped=0, lr=[9.923142181679041e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:52:20,331] [INFO] [timer.py:259:stop] epoch=0/micro_step=6210/global_step=6210, RunningAvgSamplesPerSec=2.628025732858736, CurrSamplesPerSec=2.6388883319997936, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:52:35,644] [INFO] [logging.py:96:log_dist] [Rank 0] step=6220, skipped=0, lr=[9.922847620766806e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:52:35,667] [INFO] [timer.py:259:stop] epoch=0/micro_step=6220/global_step=6220, RunningAvgSamplesPerSec=2.6280166191177345, CurrSamplesPerSec=2.612607267019291, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:52:50,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=6230, skipped=0, lr=[9.922552500863634e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:52:50,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=6230/global_step=6230, RunningAvgSamplesPerSec=2.6280210166728972, CurrSamplesPerSec=2.6303279627247877, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:53:06,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=6240, skipped=0, lr=[9.922256822003035e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:53:06,249] [INFO] [timer.py:259:stop] epoch=0/micro_step=6240/global_step=6240, RunningAvgSamplesPerSec=2.6280182648677584, CurrSamplesPerSec=2.6366844880929, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:53:21,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=6250, skipped=0, lr=[9.921960584218586e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:53:21,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=6250/global_step=6250, RunningAvgSamplesPerSec=2.6279736549147135, CurrSamplesPerSec=2.6371474293120443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:53:37,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=6260, skipped=0, lr=[9.921663787543927e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:53:37,010] [INFO] [timer.py:259:stop] epoch=0/micro_step=6260/global_step=6260, RunningAvgSamplesPerSec=2.627973093986714, CurrSamplesPerSec=2.6390572766103935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:53:52,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=6270, skipped=0, lr=[9.921366432012753e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:53:52,331] [INFO] [timer.py:259:stop] epoch=0/micro_step=6270/global_step=6270, RunningAvgSamplesPerSec=2.627966480517877, CurrSamplesPerSec=2.633269058402243, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:54:07,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=6280, skipped=0, lr=[9.921068517658834e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:54:07,650] [INFO] [timer.py:259:stop] epoch=0/micro_step=6280/global_step=6280, RunningAvgSamplesPerSec=2.627960418601383, CurrSamplesPerSec=2.621584926824947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:54:22,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=6290, skipped=0, lr=[9.920770044515997e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:54:22,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=6290/global_step=6290, RunningAvgSamplesPerSec=2.6279556264894945, CurrSamplesPerSec=2.63930969608159, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:54:38,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=6300, skipped=0, lr=[9.920471012618133e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:54:38,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=6300/global_step=6300, RunningAvgSamplesPerSec=2.6279428272224523, CurrSamplesPerSec=2.5986859890020657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:54:53,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=6310, skipped=0, lr=[9.920171421999198e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:54:53,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=6310/global_step=6310, RunningAvgSamplesPerSec=2.627946697391419, CurrSamplesPerSec=2.6342158669800595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:55:08,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=6320, skipped=0, lr=[9.91987127269321e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:55:08,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=6320/global_step=6320, RunningAvgSamplesPerSec=2.6279403639870167, CurrSamplesPerSec=2.626151400016135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:55:24,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=6330, skipped=0, lr=[9.91957056473425e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:55:24,236] [INFO] [timer.py:259:stop] epoch=0/micro_step=6330/global_step=6330, RunningAvgSamplesPerSec=2.6279292087601775, CurrSamplesPerSec=2.6160883847734544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:55:39,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=6340, skipped=0, lr=[9.919269298156467e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:55:39,569] [INFO] [timer.py:259:stop] epoch=0/micro_step=6340/global_step=6340, RunningAvgSamplesPerSec=2.627918859801301, CurrSamplesPerSec=2.630180338184053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:55:54,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=6350, skipped=0, lr=[9.918967472994066e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:55:54,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=6350/global_step=6350, RunningAvgSamplesPerSec=2.6278969881526826, CurrSamplesPerSec=2.627758447525781, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:56:10,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=6360, skipped=0, lr=[9.918665089281323e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:56:10,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=6360/global_step=6360, RunningAvgSamplesPerSec=2.6278953276256156, CurrSamplesPerSec=2.6356659268212, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:56:25,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=6370, skipped=0, lr=[9.91836214705257e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:56:25,528] [INFO] [timer.py:259:stop] epoch=0/micro_step=6370/global_step=6370, RunningAvgSamplesPerSec=2.627898749501805, CurrSamplesPerSec=2.627819362318207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:56:40,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=6380, skipped=0, lr=[9.918058646342208e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:56:40,843] [INFO] [timer.py:259:stop] epoch=0/micro_step=6380/global_step=6380, RunningAvgSamplesPerSec=2.627895736711501, CurrSamplesPerSec=2.6321903613139486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:56:56,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=6390, skipped=0, lr=[9.9177545871847e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:56:56,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=6390/global_step=6390, RunningAvgSamplesPerSec=2.627881775179222, CurrSamplesPerSec=2.6390992047902517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:57:11,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=6400, skipped=0, lr=[9.917449969614573e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:57:11,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=6400/global_step=6400, RunningAvgSamplesPerSec=2.627882748608282, CurrSamplesPerSec=2.625163961488515, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:57:26,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=6410, skipped=0, lr=[9.917144793666413e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:57:26,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=6410/global_step=6410, RunningAvgSamplesPerSec=2.62788557442132, CurrSamplesPerSec=2.613536830297281, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:57:42,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=6420, skipped=0, lr=[9.916839059374875e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:57:42,059] [INFO] [timer.py:259:stop] epoch=0/micro_step=6420/global_step=6420, RunningAvgSamplesPerSec=2.6278770778354663, CurrSamplesPerSec=2.639779374595775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:57:57,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=6430, skipped=0, lr=[9.916532766774676e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:57:57,378] [INFO] [timer.py:259:stop] epoch=0/micro_step=6430/global_step=6430, RunningAvgSamplesPerSec=2.627873356072074, CurrSamplesPerSec=2.6317456716663425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:58:12,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=6440, skipped=0, lr=[9.916225915900595e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:58:12,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=6440/global_step=6440, RunningAvgSamplesPerSec=2.6278675832207132, CurrSamplesPerSec=2.6369244346465504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:58:28,052] [INFO] [logging.py:96:log_dist] [Rank 0] step=6450, skipped=0, lr=[9.915918506787475e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:58:28,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=6450/global_step=6450, RunningAvgSamplesPerSec=2.6278519258407136, CurrSamplesPerSec=2.6005973522972115, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:58:43,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=6460, skipped=0, lr=[9.91561053947022e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:58:43,393] [INFO] [timer.py:259:stop] epoch=0/micro_step=6460/global_step=6460, RunningAvgSamplesPerSec=2.627849225151205, CurrSamplesPerSec=2.6480528507552052, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:58:58,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=6470, skipped=0, lr=[9.915302013983803e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:58:58,727] [INFO] [timer.py:259:stop] epoch=0/micro_step=6470/global_step=6470, RunningAvgSamplesPerSec=2.627843152994734, CurrSamplesPerSec=2.608449840601269, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:59:14,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=6480, skipped=0, lr=[9.914992930363256e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:59:14,026] [INFO] [timer.py:259:stop] epoch=0/micro_step=6480/global_step=6480, RunningAvgSamplesPerSec=2.6278454467383554, CurrSamplesPerSec=2.620251800023582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:59:29,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=6490, skipped=0, lr=[9.914683288643677e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:59:29,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=6490/global_step=6490, RunningAvgSamplesPerSec=2.6278385208069768, CurrSamplesPerSec=2.6127533323322205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 02:59:44,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=6500, skipped=0, lr=[9.914373088860221e-06], mom=[(0.9, 0.95)] +[2024-11-01 02:59:44,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=6500/global_step=6500, RunningAvgSamplesPerSec=2.6278391945522146, CurrSamplesPerSec=2.60262416822812, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:00:00,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=6510, skipped=0, lr=[9.914062331048118e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:00:00,042] [INFO] [timer.py:259:stop] epoch=0/micro_step=6510/global_step=6510, RunningAvgSamplesPerSec=2.6278239858577157, CurrSamplesPerSec=2.633797368064198, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:00:15,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=6520, skipped=0, lr=[9.91375101524265e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:00:15,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=6520/global_step=6520, RunningAvgSamplesPerSec=2.6278392507682167, CurrSamplesPerSec=2.6289142520189346, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:00:30,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=6530, skipped=0, lr=[9.913439141479168e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:00:30,625] [INFO] [timer.py:259:stop] epoch=0/micro_step=6530/global_step=6530, RunningAvgSamplesPerSec=2.6278306725814815, CurrSamplesPerSec=2.6331454860300534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:00:46,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=6540, skipped=0, lr=[9.913126709793089e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:00:46,008] [INFO] [timer.py:259:stop] epoch=0/micro_step=6540/global_step=6540, RunningAvgSamplesPerSec=2.6278144616634136, CurrSamplesPerSec=2.622762776348243, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:01:01,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=6550, skipped=0, lr=[9.912813720219884e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:01:01,298] [INFO] [timer.py:259:stop] epoch=0/micro_step=6550/global_step=6550, RunningAvgSamplesPerSec=2.6278212948534163, CurrSamplesPerSec=2.6344168929828813, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:01:16,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=6560, skipped=0, lr=[9.912500172795096e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:01:16,616] [INFO] [timer.py:259:stop] epoch=0/micro_step=6560/global_step=6560, RunningAvgSamplesPerSec=2.6278194665192514, CurrSamplesPerSec=2.6374508952680777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:01:31,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=6570, skipped=0, lr=[9.912186067554328e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:01:31,897] [INFO] [timer.py:259:stop] epoch=0/micro_step=6570/global_step=6570, RunningAvgSamplesPerSec=2.627829528000371, CurrSamplesPerSec=2.637456699942434, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:01:47,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=6580, skipped=0, lr=[9.911871404533247e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:01:47,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=6580/global_step=6580, RunningAvgSamplesPerSec=2.6278273304057054, CurrSamplesPerSec=2.6350689908956535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:02:02,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=6590, skipped=0, lr=[9.911556183767584e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:02:02,533] [INFO] [timer.py:259:stop] epoch=0/micro_step=6590/global_step=6590, RunningAvgSamplesPerSec=2.6278319441124007, CurrSamplesPerSec=2.639745731585323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:02:17,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=6600, skipped=0, lr=[9.91124040529313e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:02:17,785] [INFO] [timer.py:259:stop] epoch=0/micro_step=6600/global_step=6600, RunningAvgSamplesPerSec=2.6278449958659773, CurrSamplesPerSec=2.6269310270087685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:02:33,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=6610, skipped=0, lr=[9.910924069145742e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:02:33,063] [INFO] [timer.py:259:stop] epoch=0/micro_step=6610/global_step=6610, RunningAvgSamplesPerSec=2.6278557729939918, CurrSamplesPerSec=2.6681542979628565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:02:48,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=6620, skipped=0, lr=[9.910607175361342e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:02:48,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=6620/global_step=6620, RunningAvgSamplesPerSec=2.627869776696465, CurrSamplesPerSec=2.5946658080030818, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:03:03,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=6630, skipped=0, lr=[9.910289723975913e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:03:03,560] [INFO] [timer.py:259:stop] epoch=0/micro_step=6630/global_step=6630, RunningAvgSamplesPerSec=2.6278906961627064, CurrSamplesPerSec=2.6393914936693137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:03:18,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=6640, skipped=0, lr=[9.9099717150255e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:03:18,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=6640/global_step=6640, RunningAvgSamplesPerSec=2.6278854694566953, CurrSamplesPerSec=2.6281626782097582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:03:34,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=6650, skipped=0, lr=[9.909653148546217e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:03:34,173] [INFO] [timer.py:259:stop] epoch=0/micro_step=6650/global_step=6650, RunningAvgSamplesPerSec=2.62789969209542, CurrSamplesPerSec=2.6409840269264895, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:03:49,489] [INFO] [logging.py:96:log_dist] [Rank 0] step=6660, skipped=0, lr=[9.909334024574233e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:03:49,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=6660/global_step=6660, RunningAvgSamplesPerSec=2.6278914255928414, CurrSamplesPerSec=2.6372108528988094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:04:04,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=6670, skipped=0, lr=[9.909014343145786e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:04:04,786] [INFO] [timer.py:259:stop] epoch=0/micro_step=6670/global_step=6670, RunningAvgSamplesPerSec=2.6279035290837793, CurrSamplesPerSec=2.6376329254328996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:04:20,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=6680, skipped=0, lr=[9.908694104297177e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:04:20,123] [INFO] [timer.py:259:stop] epoch=0/micro_step=6680/global_step=6680, RunningAvgSamplesPerSec=2.6278995971113384, CurrSamplesPerSec=2.6483935309444706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:04:35,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=6690, skipped=0, lr=[9.908373308064768e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:04:35,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=6690/global_step=6690, RunningAvgSamplesPerSec=2.6279119641833137, CurrSamplesPerSec=2.61037844210051, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:04:50,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=6700, skipped=0, lr=[9.908051954484984e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:04:50,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=6700/global_step=6700, RunningAvgSamplesPerSec=2.6279097056065783, CurrSamplesPerSec=2.626806814923887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:05:06,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=6710, skipped=0, lr=[9.907730043594318e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:05:06,086] [INFO] [timer.py:259:stop] epoch=0/micro_step=6710/global_step=6710, RunningAvgSamplesPerSec=2.627913109676744, CurrSamplesPerSec=2.6389381414299415, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:05:21,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=6720, skipped=0, lr=[9.907407575429322e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:05:21,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=6720/global_step=6720, RunningAvgSamplesPerSec=2.6279276384755024, CurrSamplesPerSec=2.638180411261364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:05:36,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=6730, skipped=0, lr=[9.907084550026613e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:05:36,664] [INFO] [timer.py:259:stop] epoch=0/micro_step=6730/global_step=6730, RunningAvgSamplesPerSec=2.627923069282067, CurrSamplesPerSec=2.6298406172798625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:05:52,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=6740, skipped=0, lr=[9.906760967422869e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:05:52,002] [INFO] [timer.py:259:stop] epoch=0/micro_step=6740/global_step=6740, RunningAvgSamplesPerSec=2.627913772511511, CurrSamplesPerSec=2.626682614584964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:06:07,289] [INFO] [logging.py:96:log_dist] [Rank 0] step=6750, skipped=0, lr=[9.906436827654833e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:06:07,297] [INFO] [timer.py:259:stop] epoch=0/micro_step=6750/global_step=6750, RunningAvgSamplesPerSec=2.6279149013265424, CurrSamplesPerSec=2.6361802868004793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:06:22,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=6760, skipped=0, lr=[9.906112130759311e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:06:22,576] [INFO] [timer.py:259:stop] epoch=0/micro_step=6760/global_step=6760, RunningAvgSamplesPerSec=2.6279217453616366, CurrSamplesPerSec=2.643069728668751, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:06:37,836] [INFO] [logging.py:96:log_dist] [Rank 0] step=6770, skipped=0, lr=[9.905786876773176e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:06:37,838] [INFO] [timer.py:259:stop] epoch=0/micro_step=6770/global_step=6770, RunningAvgSamplesPerSec=2.6279300974949678, CurrSamplesPerSec=2.5950017193240593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:06:53,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=6780, skipped=0, lr=[9.905461065733355e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:06:53,148] [INFO] [timer.py:259:stop] epoch=0/micro_step=6780/global_step=6780, RunningAvgSamplesPerSec=2.6279313179137227, CurrSamplesPerSec=2.6129535375096875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:07:08,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=6790, skipped=0, lr=[9.90513469767685e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:07:08,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=6790/global_step=6790, RunningAvgSamplesPerSec=2.627937573108875, CurrSamplesPerSec=2.593043662195718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:07:23,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=6800, skipped=0, lr=[9.904807772640715e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:07:23,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=6800/global_step=6800, RunningAvgSamplesPerSec=2.6279378739353665, CurrSamplesPerSec=2.6359421322255776, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:07:39,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=6810, skipped=0, lr=[9.904480290662075e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:07:39,021] [INFO] [timer.py:259:stop] epoch=0/micro_step=6810/global_step=6810, RunningAvgSamplesPerSec=2.627940204172215, CurrSamplesPerSec=2.6030154522246733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:07:54,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=6820, skipped=0, lr=[9.904152251778115e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:07:54,348] [INFO] [timer.py:259:stop] epoch=0/micro_step=6820/global_step=6820, RunningAvgSamplesPerSec=2.6279323693356917, CurrSamplesPerSec=2.62631871764529, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:08:09,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=6830, skipped=0, lr=[9.903823656026085e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:08:09,661] [INFO] [timer.py:259:stop] epoch=0/micro_step=6830/global_step=6830, RunningAvgSamplesPerSec=2.6279289617750177, CurrSamplesPerSec=2.595054702526204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:08:24,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=6840, skipped=0, lr=[9.903494503443297e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:08:24,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=6840/global_step=6840, RunningAvgSamplesPerSec=2.62794145667817, CurrSamplesPerSec=2.629598660885573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:08:40,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=6850, skipped=0, lr=[9.903164794067123e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:08:40,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=6850/global_step=6850, RunningAvgSamplesPerSec=2.6279529681636355, CurrSamplesPerSec=2.63309134927991, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:08:55,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=6860, skipped=0, lr=[9.902834527935006e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:08:55,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=6860/global_step=6860, RunningAvgSamplesPerSec=2.6279620844344964, CurrSamplesPerSec=2.636528691263569, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:09:10,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=6870, skipped=0, lr=[9.902503705084446e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:09:10,761] [INFO] [timer.py:259:stop] epoch=0/micro_step=6870/global_step=6870, RunningAvgSamplesPerSec=2.627945731684539, CurrSamplesPerSec=2.6217074164898015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:09:26,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=6880, skipped=0, lr=[9.902172325553008e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:09:26,047] [INFO] [timer.py:259:stop] epoch=0/micro_step=6880/global_step=6880, RunningAvgSamplesPerSec=2.6279472112430504, CurrSamplesPerSec=2.6406697722362726, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:09:41,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=6890, skipped=0, lr=[9.901840389378322e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:09:41,380] [INFO] [timer.py:259:stop] epoch=0/micro_step=6890/global_step=6890, RunningAvgSamplesPerSec=2.627941389481808, CurrSamplesPerSec=2.6328091303704224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:09:56,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=6900, skipped=0, lr=[9.901507896598075e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:09:56,713] [INFO] [timer.py:259:stop] epoch=0/micro_step=6900/global_step=6900, RunningAvgSamplesPerSec=2.627931129360259, CurrSamplesPerSec=2.626840951511756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:10:12,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=6910, skipped=0, lr=[9.901174847250026e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:10:12,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=6910/global_step=6910, RunningAvgSamplesPerSec=2.627929643104093, CurrSamplesPerSec=2.639698798941611, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:10:27,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=6920, skipped=0, lr=[9.900841241371992e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:10:27,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=6920/global_step=6920, RunningAvgSamplesPerSec=2.627944658583033, CurrSamplesPerSec=2.6418415410947955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:10:42,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=6930, skipped=0, lr=[9.900507079001854e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:10:42,488] [INFO] [timer.py:259:stop] epoch=0/micro_step=6930/global_step=6930, RunningAvgSamplesPerSec=2.6279595983845816, CurrSamplesPerSec=2.6511958648484324, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:10:57,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=6940, skipped=0, lr=[9.900172360177553e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:10:57,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=6940/global_step=6940, RunningAvgSamplesPerSec=2.627962559074177, CurrSamplesPerSec=2.623865766236126, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:11:13,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=6950, skipped=0, lr=[9.899837084937102e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:11:13,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=6950/global_step=6950, RunningAvgSamplesPerSec=2.627972619137866, CurrSamplesPerSec=2.6264091684842765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:11:28,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=6960, skipped=0, lr=[9.899501253318567e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:11:28,317] [INFO] [timer.py:259:stop] epoch=0/micro_step=6960/global_step=6960, RunningAvgSamplesPerSec=2.6279759385428014, CurrSamplesPerSec=2.629769303527674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:11:43,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=6970, skipped=0, lr=[9.899164865360086e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:11:43,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=6970/global_step=6970, RunningAvgSamplesPerSec=2.627984644854834, CurrSamplesPerSec=2.63770673981198, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:11:58,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=6980, skipped=0, lr=[9.898827921099849e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:11:58,924] [INFO] [timer.py:259:stop] epoch=0/micro_step=6980/global_step=6980, RunningAvgSamplesPerSec=2.6279711441724647, CurrSamplesPerSec=2.5847410412649645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:12:14,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=6990, skipped=0, lr=[9.898490420576125e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:12:14,164] [INFO] [timer.py:259:stop] epoch=0/micro_step=6990/global_step=6990, RunningAvgSamplesPerSec=2.6279843481416205, CurrSamplesPerSec=2.6395401540417534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:12:29,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=7000, skipped=0, lr=[9.89815236382723e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:12:29,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=7000/global_step=7000, RunningAvgSamplesPerSec=2.627989669969224, CurrSamplesPerSec=2.6362685184622823, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:12:44,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=7010, skipped=0, lr=[9.897813750891556e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:12:44,683] [INFO] [timer.py:259:stop] epoch=0/micro_step=7010/global_step=7010, RunningAvgSamplesPerSec=2.6280004307824187, CurrSamplesPerSec=2.6418174132983117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:12:59,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=7020, skipped=0, lr=[9.89747458180755e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:12:59,967] [INFO] [timer.py:259:stop] epoch=0/micro_step=7020/global_step=7020, RunningAvgSamplesPerSec=2.6280010060913295, CurrSamplesPerSec=2.6564742488770143, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:13:15,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=7030, skipped=0, lr=[9.897134856613722e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:13:15,222] [INFO] [timer.py:259:stop] epoch=0/micro_step=7030/global_step=7030, RunningAvgSamplesPerSec=2.628010160836415, CurrSamplesPerSec=2.636399427020201, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:13:30,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=7040, skipped=0, lr=[9.89679457534865e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:13:30,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=7040/global_step=7040, RunningAvgSamplesPerSec=2.6280023520753844, CurrSamplesPerSec=2.6187050104508263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:13:45,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=7050, skipped=0, lr=[9.896453738050977e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:13:45,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=7050/global_step=7050, RunningAvgSamplesPerSec=2.628014759287023, CurrSamplesPerSec=2.635085959680304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:14:01,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=7060, skipped=0, lr=[9.8961123447594e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:14:01,081] [INFO] [timer.py:259:stop] epoch=0/micro_step=7060/global_step=7060, RunningAvgSamplesPerSec=2.6280184477590276, CurrSamplesPerSec=2.6394263733704224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:14:16,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=7070, skipped=0, lr=[9.895770395512687e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:14:16,336] [INFO] [timer.py:259:stop] epoch=0/micro_step=7070/global_step=7070, RunningAvgSamplesPerSec=2.628024516087361, CurrSamplesPerSec=2.6380393704540555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:14:31,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=7080, skipped=0, lr=[9.895427890349667e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:14:31,554] [INFO] [timer.py:259:stop] epoch=0/micro_step=7080/global_step=7080, RunningAvgSamplesPerSec=2.6280411670869492, CurrSamplesPerSec=2.6439960983176607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:14:46,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=7090, skipped=0, lr=[9.895084829309229e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:14:46,932] [INFO] [timer.py:259:stop] epoch=0/micro_step=7090/global_step=7090, RunningAvgSamplesPerSec=2.6280210249928686, CurrSamplesPerSec=2.6375325776785377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:15:02,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=7100, skipped=0, lr=[9.894741212430328e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:15:02,224] [INFO] [timer.py:259:stop] epoch=0/micro_step=7100/global_step=7100, RunningAvgSamplesPerSec=2.6280221374119677, CurrSamplesPerSec=2.6337969545934836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:15:17,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=7110, skipped=0, lr=[9.894397039751984e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:15:17,560] [INFO] [timer.py:259:stop] epoch=0/micro_step=7110/global_step=7110, RunningAvgSamplesPerSec=2.62801185631793, CurrSamplesPerSec=2.620478941680365, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:15:32,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=7120, skipped=0, lr=[9.894052311313276e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:15:32,864] [INFO] [timer.py:259:stop] epoch=0/micro_step=7120/global_step=7120, RunningAvgSamplesPerSec=2.6280132870824997, CurrSamplesPerSec=2.637530919102115, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:15:48,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=7130, skipped=0, lr=[9.893707027153348e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:15:48,149] [INFO] [timer.py:259:stop] epoch=0/micro_step=7130/global_step=7130, RunningAvgSamplesPerSec=2.628017106680264, CurrSamplesPerSec=2.581935898931293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:16:03,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=7140, skipped=0, lr=[9.893361187311408e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:16:03,448] [INFO] [timer.py:259:stop] epoch=0/micro_step=7140/global_step=7140, RunningAvgSamplesPerSec=2.6280180679447063, CurrSamplesPerSec=2.6109800897118753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:16:18,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=7150, skipped=0, lr=[9.893014791826728e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:16:18,723] [INFO] [timer.py:259:stop] epoch=0/micro_step=7150/global_step=7150, RunningAvgSamplesPerSec=2.628024091922513, CurrSamplesPerSec=2.581867556886737, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:16:34,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=7160, skipped=0, lr=[9.892667840738636e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:16:34,042] [INFO] [timer.py:259:stop] epoch=0/micro_step=7160/global_step=7160, RunningAvgSamplesPerSec=2.6280231538282197, CurrSamplesPerSec=2.634230343136166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:16:49,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=7170, skipped=0, lr=[9.892320334086535e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:16:49,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=7170/global_step=7170, RunningAvgSamplesPerSec=2.628028562908995, CurrSamplesPerSec=2.6261781200743104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:17:04,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=7180, skipped=0, lr=[9.89197227190988e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:17:04,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=7180/global_step=7180, RunningAvgSamplesPerSec=2.6280354969794635, CurrSamplesPerSec=2.6420462290955284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:17:19,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=7190, skipped=0, lr=[9.891623654248194e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:17:19,879] [INFO] [timer.py:259:stop] epoch=0/micro_step=7190/global_step=7190, RunningAvgSamplesPerSec=2.6280326963887215, CurrSamplesPerSec=2.6296744993257364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:17:35,168] [INFO] [logging.py:96:log_dist] [Rank 0] step=7200, skipped=0, lr=[9.891274481141064e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:17:35,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=7200/global_step=7200, RunningAvgSamplesPerSec=2.6280340777457685, CurrSamplesPerSec=2.6382667024848367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:17:50,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=7210, skipped=0, lr=[9.89092475262814e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:17:50,468] [INFO] [timer.py:259:stop] epoch=0/micro_step=7210/global_step=7210, RunningAvgSamplesPerSec=2.628037085821521, CurrSamplesPerSec=2.6336675446389477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:18:05,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=7220, skipped=0, lr=[9.89057446874913e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:18:05,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=7220/global_step=7220, RunningAvgSamplesPerSec=2.6280403642330987, CurrSamplesPerSec=2.6244461413628044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:18:21,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=7230, skipped=0, lr=[9.890223629543811e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:18:21,097] [INFO] [timer.py:259:stop] epoch=0/micro_step=7230/global_step=7230, RunningAvgSamplesPerSec=2.628044625783972, CurrSamplesPerSec=2.6417383770514924, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:18:36,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=7240, skipped=0, lr=[9.88987223505202e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:18:36,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=7240/global_step=7240, RunningAvgSamplesPerSec=2.6280445883873442, CurrSamplesPerSec=2.5922187306250914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:18:51,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=7250, skipped=0, lr=[9.88952028531366e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:18:51,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=7250/global_step=7250, RunningAvgSamplesPerSec=2.6280319832840666, CurrSamplesPerSec=2.641132866752933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:19:07,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=7260, skipped=0, lr=[9.889167780368693e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:19:07,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=7260/global_step=7260, RunningAvgSamplesPerSec=2.628032515904104, CurrSamplesPerSec=2.63411577905376, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:19:22,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=7270, skipped=0, lr=[9.888814720257147e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:19:22,422] [INFO] [timer.py:259:stop] epoch=0/micro_step=7270/global_step=7270, RunningAvgSamplesPerSec=2.6280330957128606, CurrSamplesPerSec=2.6311439077432164, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:19:37,707] [INFO] [logging.py:96:log_dist] [Rank 0] step=7280, skipped=0, lr=[9.888461105019111e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:19:37,733] [INFO] [timer.py:259:stop] epoch=0/micro_step=7280/global_step=7280, RunningAvgSamplesPerSec=2.6280290483546414, CurrSamplesPerSec=2.6089933916917216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:19:53,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=7290, skipped=0, lr=[9.888106934694741e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:19:53,062] [INFO] [timer.py:259:stop] epoch=0/micro_step=7290/global_step=7290, RunningAvgSamplesPerSec=2.6280223192110417, CurrSamplesPerSec=2.6279609589634334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:20:08,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=7300, skipped=0, lr=[9.88775220932425e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:20:08,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=7300/global_step=7300, RunningAvgSamplesPerSec=2.6280156467866926, CurrSamplesPerSec=2.601875443841526, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:20:23,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=7310, skipped=0, lr=[9.887396928947917e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:20:23,701] [INFO] [timer.py:259:stop] epoch=0/micro_step=7310/global_step=7310, RunningAvgSamplesPerSec=2.628015936740295, CurrSamplesPerSec=2.631462915654867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:20:38,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=7320, skipped=0, lr=[9.887041093606086e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:20:38,977] [INFO] [timer.py:259:stop] epoch=0/micro_step=7320/global_step=7320, RunningAvgSamplesPerSec=2.6280208345851728, CurrSamplesPerSec=2.603362820377906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:20:54,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=7330, skipped=0, lr=[9.886684703339164e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:20:54,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=7330/global_step=7330, RunningAvgSamplesPerSec=2.6280327681698767, CurrSamplesPerSec=2.6374583584255142, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:21:09,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=7340, skipped=0, lr=[9.886327758187615e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:21:09,529] [INFO] [timer.py:259:stop] epoch=0/micro_step=7340/global_step=7340, RunningAvgSamplesPerSec=2.6280331827794843, CurrSamplesPerSec=2.6295879449369672, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:21:24,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=7350, skipped=0, lr=[9.885970258191973e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:21:24,821] [INFO] [timer.py:259:stop] epoch=0/micro_step=7350/global_step=7350, RunningAvgSamplesPerSec=2.6280326961261857, CurrSamplesPerSec=2.6128595349165002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:21:40,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=7360, skipped=0, lr=[9.88561220339283e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:21:40,167] [INFO] [timer.py:259:stop] epoch=0/micro_step=7360/global_step=7360, RunningAvgSamplesPerSec=2.628022913116031, CurrSamplesPerSec=2.646685995687982, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:21:55,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=7370, skipped=0, lr=[9.885253593830845e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:21:55,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=7370/global_step=7370, RunningAvgSamplesPerSec=2.628022196059656, CurrSamplesPerSec=2.6406149100386087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:22:10,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=7380, skipped=0, lr=[9.88489442954674e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:22:10,765] [INFO] [timer.py:259:stop] epoch=0/micro_step=7380/global_step=7380, RunningAvgSamplesPerSec=2.628021363832663, CurrSamplesPerSec=2.6530749362669166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:22:26,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=7390, skipped=0, lr=[9.884534710581292e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:22:26,052] [INFO] [timer.py:259:stop] epoch=0/micro_step=7390/global_step=7390, RunningAvgSamplesPerSec=2.6280226659290453, CurrSamplesPerSec=2.639762760656277, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:22:41,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=7400, skipped=0, lr=[9.884174436975356e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:22:41,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=7400/global_step=7400, RunningAvgSamplesPerSec=2.6280333381812087, CurrSamplesPerSec=2.635490378064569, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:22:56,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=7410, skipped=0, lr=[9.883813608769834e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:22:56,557] [INFO] [timer.py:259:stop] epoch=0/micro_step=7410/global_step=7410, RunningAvgSamplesPerSec=2.6280426496268863, CurrSamplesPerSec=2.6431829908339703, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:23:11,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=7420, skipped=0, lr=[9.8834522260057e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:23:11,843] [INFO] [timer.py:259:stop] epoch=0/micro_step=7420/global_step=7420, RunningAvgSamplesPerSec=2.6280465463347507, CurrSamplesPerSec=2.625498366436613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:23:27,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=7430, skipped=0, lr=[9.88309028872399e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:23:27,121] [INFO] [timer.py:259:stop] epoch=0/micro_step=7430/global_step=7430, RunningAvgSamplesPerSec=2.6280491870276568, CurrSamplesPerSec=2.6356758642426215, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:23:42,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=7440, skipped=0, lr=[9.882727796965804e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:23:42,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=7440/global_step=7440, RunningAvgSamplesPerSec=2.62804842693571, CurrSamplesPerSec=2.643827354049867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:23:57,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=7450, skipped=0, lr=[9.8823647507723e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:23:57,666] [INFO] [timer.py:259:stop] epoch=0/micro_step=7450/global_step=7450, RunningAvgSamplesPerSec=2.62805596344777, CurrSamplesPerSec=2.615313956299243, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:24:12,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=7460, skipped=0, lr=[9.882001150184702e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:24:12,907] [INFO] [timer.py:259:stop] epoch=0/micro_step=7460/global_step=7460, RunningAvgSamplesPerSec=2.628065582260101, CurrSamplesPerSec=2.631932282635028, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:24:28,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=7470, skipped=0, lr=[9.881636995244298e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:24:28,195] [INFO] [timer.py:259:stop] epoch=0/micro_step=7470/global_step=7470, RunningAvgSamplesPerSec=2.6280682062887974, CurrSamplesPerSec=2.6058257316709046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:24:43,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=7480, skipped=0, lr=[9.881272285992438e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:24:43,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=7480/global_step=7480, RunningAvgSamplesPerSec=2.628062870100664, CurrSamplesPerSec=2.5965973814387295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:24:58,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=7490, skipped=0, lr=[9.880907022470535e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:24:58,809] [INFO] [timer.py:259:stop] epoch=0/micro_step=7490/global_step=7490, RunningAvgSamplesPerSec=2.628063983579299, CurrSamplesPerSec=2.5605566857529896, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:25:14,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=7500, skipped=0, lr=[9.880541204720064e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:25:14,074] [INFO] [timer.py:259:stop] epoch=0/micro_step=7500/global_step=7500, RunningAvgSamplesPerSec=2.628069462891606, CurrSamplesPerSec=2.6440235994247177, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:25:29,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=7510, skipped=0, lr=[9.880174832782564e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:25:29,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=7510/global_step=7510, RunningAvgSamplesPerSec=2.6280765514710995, CurrSamplesPerSec=2.6242031242126416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:25:44,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=7520, skipped=0, lr=[9.87980790669964e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:25:44,625] [INFO] [timer.py:259:stop] epoch=0/micro_step=7520/global_step=7520, RunningAvgSamplesPerSec=2.6280793605434805, CurrSamplesPerSec=2.6387335196927366, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:25:59,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=7530, skipped=0, lr=[9.87944042651295e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:25:59,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=7530/global_step=7530, RunningAvgSamplesPerSec=2.6280674611475483, CurrSamplesPerSec=2.6363306569796197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:26:15,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=7540, skipped=0, lr=[9.879072392264224e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:26:15,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=7540/global_step=7540, RunningAvgSamplesPerSec=2.6280696953604403, CurrSamplesPerSec=2.626868096901648, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:26:30,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=7550, skipped=0, lr=[9.878703803995255e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:26:30,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=7550/global_step=7550, RunningAvgSamplesPerSec=2.6280642282241637, CurrSamplesPerSec=2.611377955035569, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:26:45,900] [INFO] [logging.py:96:log_dist] [Rank 0] step=7560, skipped=0, lr=[9.878334661747895e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:26:45,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=7560/global_step=7560, RunningAvgSamplesPerSec=2.628067694837179, CurrSamplesPerSec=2.5753836123451443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:27:01,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=7570, skipped=0, lr=[9.87796496556406e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:27:01,253] [INFO] [timer.py:259:stop] epoch=0/micro_step=7570/global_step=7570, RunningAvgSamplesPerSec=2.6280566874026507, CurrSamplesPerSec=2.631959120459696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:27:16,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=7580, skipped=0, lr=[9.877594715485727e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:27:16,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=7580/global_step=7580, RunningAvgSamplesPerSec=2.6280597661055296, CurrSamplesPerSec=2.638819017005263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:27:31,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=7590, skipped=0, lr=[9.877223911554942e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:27:31,883] [INFO] [timer.py:259:stop] epoch=0/micro_step=7590/global_step=7590, RunningAvgSamplesPerSec=2.628052296403647, CurrSamplesPerSec=2.639098374516335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:27:47,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=7600, skipped=0, lr=[9.876852553813808e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:27:47,146] [INFO] [timer.py:259:stop] epoch=0/micro_step=7600/global_step=7600, RunningAvgSamplesPerSec=2.6280599388044372, CurrSamplesPerSec=2.638483285128538, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:28:02,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=7610, skipped=0, lr=[9.87648064230449e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:28:02,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=7610/global_step=7610, RunningAvgSamplesPerSec=2.628057522537813, CurrSamplesPerSec=2.621047991842453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:28:17,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=7620, skipped=0, lr=[9.876108177069226e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:28:17,682] [INFO] [timer.py:259:stop] epoch=0/micro_step=7620/global_step=7620, RunningAvgSamplesPerSec=2.6280741642945227, CurrSamplesPerSec=2.635141420159395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:28:32,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=7630, skipped=0, lr=[9.875735158150301e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:28:32,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=7630/global_step=7630, RunningAvgSamplesPerSec=2.6280759910914226, CurrSamplesPerSec=2.6346539445378, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:28:48,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=7640, skipped=0, lr=[9.875361585590077e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:28:48,327] [INFO] [timer.py:259:stop] epoch=0/micro_step=7640/global_step=7640, RunningAvgSamplesPerSec=2.628061393347394, CurrSamplesPerSec=2.5791297019265333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:29:03,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=7650, skipped=0, lr=[9.874987459430971e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:29:03,610] [INFO] [timer.py:259:stop] epoch=0/micro_step=7650/global_step=7650, RunningAvgSamplesPerSec=2.6280690187271984, CurrSamplesPerSec=2.6303415714076923, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:29:18,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=7660, skipped=0, lr=[9.874612779715466e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:29:18,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=7660/global_step=7660, RunningAvgSamplesPerSec=2.628063677829896, CurrSamplesPerSec=2.6044382295554125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:29:34,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=7670, skipped=0, lr=[9.874237546486107e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:29:34,206] [INFO] [timer.py:259:stop] epoch=0/micro_step=7670/global_step=7670, RunningAvgSamplesPerSec=2.628069487573021, CurrSamplesPerSec=2.6291527863607844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:29:49,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=7680, skipped=0, lr=[9.873861759785501e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:29:49,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=7680/global_step=7680, RunningAvgSamplesPerSec=2.628064669660325, CurrSamplesPerSec=2.6212736333720157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:30:04,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=7690, skipped=0, lr=[9.87348541965632e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:30:04,877] [INFO] [timer.py:259:stop] epoch=0/micro_step=7690/global_step=7690, RunningAvgSamplesPerSec=2.6280550333811736, CurrSamplesPerSec=2.6350611273866273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:30:20,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=7700, skipped=0, lr=[9.873108526141295e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:30:20,152] [INFO] [timer.py:259:stop] epoch=0/micro_step=7700/global_step=7700, RunningAvgSamplesPerSec=2.6280578439301054, CurrSamplesPerSec=2.622299133087324, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:30:35,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=7710, skipped=0, lr=[9.872731079283227e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:30:35,471] [INFO] [timer.py:259:stop] epoch=0/micro_step=7710/global_step=7710, RunningAvgSamplesPerSec=2.628052842012511, CurrSamplesPerSec=2.6310807756312204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:30:50,777] [INFO] [logging.py:96:log_dist] [Rank 0] step=7720, skipped=0, lr=[9.872353079124972e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:30:50,778] [INFO] [timer.py:259:stop] epoch=0/micro_step=7720/global_step=7720, RunningAvgSamplesPerSec=2.628052045605365, CurrSamplesPerSec=2.635077682197075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:31:06,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=7730, skipped=0, lr=[9.87197452570945e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:31:06,087] [INFO] [timer.py:259:stop] epoch=0/micro_step=7730/global_step=7730, RunningAvgSamplesPerSec=2.6280545585696014, CurrSamplesPerSec=2.6236729119627062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:31:21,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=7740, skipped=0, lr=[9.87159541907965e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:31:21,383] [INFO] [timer.py:259:stop] epoch=0/micro_step=7740/global_step=7740, RunningAvgSamplesPerSec=2.6280618726799614, CurrSamplesPerSec=2.64056960888165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:31:36,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=7750, skipped=0, lr=[9.871215759278618e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:31:36,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=7750/global_step=7750, RunningAvgSamplesPerSec=2.628076230775869, CurrSamplesPerSec=2.641728393861986, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:31:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=7760, skipped=0, lr=[9.870835546349466e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:31:51,944] [INFO] [timer.py:259:stop] epoch=0/micro_step=7760/global_step=7760, RunningAvgSamplesPerSec=2.6280725852719904, CurrSamplesPerSec=2.6331773079067315, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:32:07,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=7770, skipped=0, lr=[9.870454780335363e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:32:07,199] [INFO] [timer.py:259:stop] epoch=0/micro_step=7770/global_step=7770, RunningAvgSamplesPerSec=2.628081410899127, CurrSamplesPerSec=2.6218467163053107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:32:22,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=7780, skipped=0, lr=[9.87007346127955e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:32:22,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=7780/global_step=7780, RunningAvgSamplesPerSec=2.6280812553386026, CurrSamplesPerSec=2.648978953115363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:32:37,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=7790, skipped=0, lr=[9.869691589225324e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:32:37,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=7790/global_step=7790, RunningAvgSamplesPerSec=2.6280632109673214, CurrSamplesPerSec=2.5786448922459333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:32:53,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=7800, skipped=0, lr=[9.869309164216046e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:32:53,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=7800/global_step=7800, RunningAvgSamplesPerSec=2.6280568825375443, CurrSamplesPerSec=2.6303819859021145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:33:08,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=7810, skipped=0, lr=[9.86892618629514e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:33:08,488] [INFO] [timer.py:259:stop] epoch=0/micro_step=7810/global_step=7810, RunningAvgSamplesPerSec=2.6280549906768442, CurrSamplesPerSec=2.57518121734069, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:33:23,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=7820, skipped=0, lr=[9.868542655506094e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:33:23,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=7820/global_step=7820, RunningAvgSamplesPerSec=2.628041622940325, CurrSamplesPerSec=2.635160873268516, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:33:39,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=7830, skipped=0, lr=[9.86815857189246e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:33:39,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=7830/global_step=7830, RunningAvgSamplesPerSec=2.62804862645054, CurrSamplesPerSec=2.6167812322170407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:33:54,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=7840, skipped=0, lr=[9.86777393549785e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:33:54,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=7840/global_step=7840, RunningAvgSamplesPerSec=2.6280526643150703, CurrSamplesPerSec=2.63412777266035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:34:09,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=7850, skipped=0, lr=[9.867388746365936e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:34:09,725] [INFO] [timer.py:259:stop] epoch=0/micro_step=7850/global_step=7850, RunningAvgSamplesPerSec=2.6280458621527796, CurrSamplesPerSec=2.621990948181559, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:34:25,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=7860, skipped=0, lr=[9.86700300454046e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:34:25,030] [INFO] [timer.py:259:stop] epoch=0/micro_step=7860/global_step=7860, RunningAvgSamplesPerSec=2.6280497636930087, CurrSamplesPerSec=2.6527380839502284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:34:40,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=7870, skipped=0, lr=[9.866616710065223e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:34:40,349] [INFO] [timer.py:259:stop] epoch=0/micro_step=7870/global_step=7870, RunningAvgSamplesPerSec=2.6280470222767613, CurrSamplesPerSec=2.6368336725248778, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:34:55,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=7880, skipped=0, lr=[9.866229862984087e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:34:55,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=7880/global_step=7880, RunningAvgSamplesPerSec=2.628043295918117, CurrSamplesPerSec=2.6256598479718796, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:35:11,106] [INFO] [logging.py:96:log_dist] [Rank 0] step=7890, skipped=0, lr=[9.86584246334098e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:35:11,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=7890/global_step=7890, RunningAvgSamplesPerSec=2.6280311612585665, CurrSamplesPerSec=2.623242579868709, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:35:26,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=7900, skipped=0, lr=[9.86545451117989e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:35:26,449] [INFO] [timer.py:259:stop] epoch=0/micro_step=7900/global_step=7900, RunningAvgSamplesPerSec=2.6280251288692305, CurrSamplesPerSec=2.6178612224225186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:35:41,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=7910, skipped=0, lr=[9.865066006544873e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:35:41,771] [INFO] [timer.py:259:stop] epoch=0/micro_step=7910/global_step=7910, RunningAvgSamplesPerSec=2.62802452070189, CurrSamplesPerSec=2.651600214689375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:35:57,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=7920, skipped=0, lr=[9.864676949480039e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:35:57,124] [INFO] [timer.py:259:stop] epoch=0/micro_step=7920/global_step=7920, RunningAvgSamplesPerSec=2.6280203182632693, CurrSamplesPerSec=2.626309261780922, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:36:12,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=7930, skipped=0, lr=[9.864287340029568e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:36:12,477] [INFO] [timer.py:259:stop] epoch=0/micro_step=7930/global_step=7930, RunningAvgSamplesPerSec=2.628010913747039, CurrSamplesPerSec=2.630302807651351, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:36:27,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=7940, skipped=0, lr=[9.863897178237698e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:36:27,798] [INFO] [timer.py:259:stop] epoch=0/micro_step=7940/global_step=7940, RunningAvgSamplesPerSec=2.628005997968321, CurrSamplesPerSec=2.5984931961152395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:36:43,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=7950, skipped=0, lr=[9.863506464148735e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:36:43,068] [INFO] [timer.py:259:stop] epoch=0/micro_step=7950/global_step=7950, RunningAvgSamplesPerSec=2.6280125789568336, CurrSamplesPerSec=2.6371387243519506, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:36:58,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=7960, skipped=0, lr=[9.863115197807045e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:36:58,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=7960/global_step=7960, RunningAvgSamplesPerSec=2.628013347383941, CurrSamplesPerSec=2.623212638274974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:37:13,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=7970, skipped=0, lr=[9.862723379257053e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:37:13,632] [INFO] [timer.py:259:stop] epoch=0/micro_step=7970/global_step=7970, RunningAvgSamplesPerSec=2.6280168920020075, CurrSamplesPerSec=2.637377095207364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:37:28,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=7980, skipped=0, lr=[9.862331008543254e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:37:28,917] [INFO] [timer.py:259:stop] epoch=0/micro_step=7980/global_step=7980, RunningAvgSamplesPerSec=2.6280186396753815, CurrSamplesPerSec=2.5713145552869663, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:37:44,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=7990, skipped=0, lr=[9.861938085710197e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:37:44,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=7990/global_step=7990, RunningAvgSamplesPerSec=2.6280276020749147, CurrSamplesPerSec=2.637999549897385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:37:59,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=8000, skipped=0, lr=[9.861544610802503e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:37:59,436] [INFO] [timer.py:259:stop] epoch=0/micro_step=8000/global_step=8000, RunningAvgSamplesPerSec=2.6280295831262253, CurrSamplesPerSec=2.616128362458439, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:38:14,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=8010, skipped=0, lr=[9.861150583864848e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:38:14,713] [INFO] [timer.py:259:stop] epoch=0/micro_step=8010/global_step=8010, RunningAvgSamplesPerSec=2.628034509141006, CurrSamplesPerSec=2.6240323823271043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:38:29,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=8020, skipped=0, lr=[9.860756004941978e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:38:29,983] [INFO] [timer.py:259:stop] epoch=0/micro_step=8020/global_step=8020, RunningAvgSamplesPerSec=2.6280409830351115, CurrSamplesPerSec=2.630520559311908, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:38:45,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=8030, skipped=0, lr=[9.860360874078694e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:38:45,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=8030/global_step=8030, RunningAvgSamplesPerSec=2.6280460377387693, CurrSamplesPerSec=2.6449919196223326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:39:00,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=8040, skipped=0, lr=[9.859965191319865e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:39:00,558] [INFO] [timer.py:259:stop] epoch=0/micro_step=8040/global_step=8040, RunningAvgSamplesPerSec=2.628042496023965, CurrSamplesPerSec=2.6476261836491326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:39:15,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=8050, skipped=0, lr=[9.859568956710418e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:39:15,848] [INFO] [timer.py:259:stop] epoch=0/micro_step=8050/global_step=8050, RunningAvgSamplesPerSec=2.6280434597777433, CurrSamplesPerSec=2.6508021089443314, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:39:31,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=8060, skipped=0, lr=[9.85917217029535e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:39:31,139] [INFO] [timer.py:259:stop] epoch=0/micro_step=8060/global_step=8060, RunningAvgSamplesPerSec=2.6280443736795047, CurrSamplesPerSec=2.630631511048099, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:39:46,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=8070, skipped=0, lr=[9.858774832119713e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:39:46,350] [INFO] [timer.py:259:stop] epoch=0/micro_step=8070/global_step=8070, RunningAvgSamplesPerSec=2.628061481655742, CurrSamplesPerSec=2.63014941335448, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:40:01,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=8080, skipped=0, lr=[9.858376942228625e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:40:01,647] [INFO] [timer.py:259:stop] epoch=0/micro_step=8080/global_step=8080, RunningAvgSamplesPerSec=2.6280617340727424, CurrSamplesPerSec=2.63984001724969, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:40:16,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=8090, skipped=0, lr=[9.857978500667266e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:40:16,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=8090/global_step=8090, RunningAvgSamplesPerSec=2.6280728633044785, CurrSamplesPerSec=2.658112316678739, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:40:32,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=8100, skipped=0, lr=[9.857579507480884e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:40:32,266] [INFO] [timer.py:259:stop] epoch=0/micro_step=8100/global_step=8100, RunningAvgSamplesPerSec=2.6280555178723746, CurrSamplesPerSec=2.6383048715573585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:40:47,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=8110, skipped=0, lr=[9.85717996271478e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:40:47,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=8110/global_step=8110, RunningAvgSamplesPerSec=2.62806416179712, CurrSamplesPerSec=2.640833541142056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:41:02,813] [INFO] [logging.py:96:log_dist] [Rank 0] step=8120, skipped=0, lr=[9.856779866414323e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:41:02,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=8120/global_step=8120, RunningAvgSamplesPerSec=2.6280604142997186, CurrSamplesPerSec=2.636038217240765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:41:18,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=8130, skipped=0, lr=[9.856379218624945e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:41:18,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=8130/global_step=8130, RunningAvgSamplesPerSec=2.628062129107134, CurrSamplesPerSec=2.6206549523832097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:41:33,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=8140, skipped=0, lr=[9.85597801939214e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:41:33,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=8140/global_step=8140, RunningAvgSamplesPerSec=2.628059306405175, CurrSamplesPerSec=2.6287383658199808, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:41:48,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=8150, skipped=0, lr=[9.855576268761464e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:41:48,824] [INFO] [timer.py:259:stop] epoch=0/micro_step=8150/global_step=8150, RunningAvgSamplesPerSec=2.6280354815261675, CurrSamplesPerSec=2.6148130039074995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:42:04,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=8160, skipped=0, lr=[9.855173966778535e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:42:04,085] [INFO] [timer.py:259:stop] epoch=0/micro_step=8160/global_step=8160, RunningAvgSamplesPerSec=2.628042852921144, CurrSamplesPerSec=2.6327000605018696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:42:19,397] [INFO] [logging.py:96:log_dist] [Rank 0] step=8170, skipped=0, lr=[9.854771113489037e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:42:19,419] [INFO] [timer.py:259:stop] epoch=0/micro_step=8170/global_step=8170, RunningAvgSamplesPerSec=2.6280346705080313, CurrSamplesPerSec=2.598828488977656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:42:34,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=8180, skipped=0, lr=[9.854367708938711e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:42:34,733] [INFO] [timer.py:259:stop] epoch=0/micro_step=8180/global_step=8180, RunningAvgSamplesPerSec=2.628032769765781, CurrSamplesPerSec=2.632109422393087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:42:50,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=8190, skipped=0, lr=[9.853963753173365e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:42:50,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=8190/global_step=8190, RunningAvgSamplesPerSec=2.6280288337102635, CurrSamplesPerSec=2.6252732294105865, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:43:05,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=8200, skipped=0, lr=[9.853559246238869e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:43:05,381] [INFO] [timer.py:259:stop] epoch=0/micro_step=8200/global_step=8200, RunningAvgSamplesPerSec=2.6280248396297283, CurrSamplesPerSec=2.6496035492847145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:43:20,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=8210, skipped=0, lr=[9.853154188181153e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:43:20,697] [INFO] [timer.py:259:stop] epoch=0/micro_step=8210/global_step=8210, RunningAvgSamplesPerSec=2.62802266548133, CurrSamplesPerSec=2.64062571604877, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:43:35,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=8220, skipped=0, lr=[9.852748579046215e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:43:35,962] [INFO] [timer.py:259:stop] epoch=0/micro_step=8220/global_step=8220, RunningAvgSamplesPerSec=2.628032565534779, CurrSamplesPerSec=2.646981220461021, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:43:51,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=8230, skipped=0, lr=[9.852342418880107e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:43:51,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=8230/global_step=8230, RunningAvgSamplesPerSec=2.6280215950003902, CurrSamplesPerSec=2.6333649486356276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:44:06,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=8240, skipped=0, lr=[9.851935707728952e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:44:06,696] [INFO] [timer.py:259:stop] epoch=0/micro_step=8240/global_step=8240, RunningAvgSamplesPerSec=2.6280154896311143, CurrSamplesPerSec=2.6216586650513083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:44:22,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=8250, skipped=0, lr=[9.851528445638933e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:44:22,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=8250/global_step=8250, RunningAvgSamplesPerSec=2.6280088728963373, CurrSamplesPerSec=2.646331979842757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:44:37,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=8260, skipped=0, lr=[9.851120632656292e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:44:37,357] [INFO] [timer.py:259:stop] epoch=0/micro_step=8260/global_step=8260, RunningAvgSamplesPerSec=2.6280112383458785, CurrSamplesPerSec=2.6315739469015615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:44:52,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=8270, skipped=0, lr=[9.850712268827339e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:44:52,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=8270/global_step=8270, RunningAvgSamplesPerSec=2.6280012535094532, CurrSamplesPerSec=2.614994775672593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:45:08,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=8280, skipped=0, lr=[9.85030335419844e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:45:08,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=8280/global_step=8280, RunningAvgSamplesPerSec=2.6280037425790557, CurrSamplesPerSec=2.6339623532394683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:45:23,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=8290, skipped=0, lr=[9.849893888816033e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:45:23,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=8290/global_step=8290, RunningAvgSamplesPerSec=2.628018283583522, CurrSamplesPerSec=2.6378369617668276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:45:38,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=8300, skipped=0, lr=[9.849483872726609e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:45:38,630] [INFO] [timer.py:259:stop] epoch=0/micro_step=8300/global_step=8300, RunningAvgSamplesPerSec=2.628004764928839, CurrSamplesPerSec=2.563160461678848, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:45:53,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=8310, skipped=0, lr=[9.849073305976727e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:45:53,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=8310/global_step=8310, RunningAvgSamplesPerSec=2.628004716954404, CurrSamplesPerSec=2.63793401451527, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:46:09,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=8320, skipped=0, lr=[9.848662188613003e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:46:09,297] [INFO] [timer.py:259:stop] epoch=0/micro_step=8320/global_step=8320, RunningAvgSamplesPerSec=2.6279972307884085, CurrSamplesPerSec=2.599718460919546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:46:24,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=8330, skipped=0, lr=[9.848250520682125e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:46:24,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=8330/global_step=8330, RunningAvgSamplesPerSec=2.627994190352749, CurrSamplesPerSec=2.648414852496443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:46:39,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=8340, skipped=0, lr=[9.847838302230837e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:46:39,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=8340/global_step=8340, RunningAvgSamplesPerSec=2.627987138182287, CurrSamplesPerSec=2.639898170356643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:46:55,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=8350, skipped=0, lr=[9.847425533305943e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:46:55,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=8350/global_step=8350, RunningAvgSamplesPerSec=2.627988790989489, CurrSamplesPerSec=2.6483655207999055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:47:10,644] [INFO] [logging.py:96:log_dist] [Rank 0] step=8360, skipped=0, lr=[9.847012213954316e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:47:10,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=8360/global_step=8360, RunningAvgSamplesPerSec=2.6279775154012035, CurrSamplesPerSec=2.641426438038867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:47:25,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=8370, skipped=0, lr=[9.846598344222888e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:47:25,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=8370/global_step=8370, RunningAvgSamplesPerSec=2.627974980288012, CurrSamplesPerSec=2.57064123942543, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:47:41,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=8380, skipped=0, lr=[9.846183924158654e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:47:41,283] [INFO] [timer.py:259:stop] epoch=0/micro_step=8380/global_step=8380, RunningAvgSamplesPerSec=2.627975337950436, CurrSamplesPerSec=2.637477845757948, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:47:56,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=8390, skipped=0, lr=[9.845768953808673e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:47:56,567] [INFO] [timer.py:259:stop] epoch=0/micro_step=8390/global_step=8390, RunningAvgSamplesPerSec=2.627983422597273, CurrSamplesPerSec=2.6471202953991693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:48:11,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=8400, skipped=0, lr=[9.845353433220061e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:48:11,909] [INFO] [timer.py:259:stop] epoch=0/micro_step=8400/global_step=8400, RunningAvgSamplesPerSec=2.627977424717467, CurrSamplesPerSec=2.6465707631529765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:48:27,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=8410, skipped=0, lr=[9.844937362440003e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:48:27,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=8410/global_step=8410, RunningAvgSamplesPerSec=2.62798486118396, CurrSamplesPerSec=2.6371797625241125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:48:42,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=8420, skipped=0, lr=[9.844520741515746e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:48:42,500] [INFO] [timer.py:259:stop] epoch=0/micro_step=8420/global_step=8420, RunningAvgSamplesPerSec=2.6279834922180267, CurrSamplesPerSec=2.6420429005816763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:48:57,754] [INFO] [logging.py:96:log_dist] [Rank 0] step=8430, skipped=0, lr=[9.844103570494594e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:48:57,758] [INFO] [timer.py:259:stop] epoch=0/micro_step=8430/global_step=8430, RunningAvgSamplesPerSec=2.627991762681054, CurrSamplesPerSec=2.592478293085989, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:49:13,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=8440, skipped=0, lr=[9.843685849423919e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:49:13,118] [INFO] [timer.py:259:stop] epoch=0/micro_step=8440/global_step=8440, RunningAvgSamplesPerSec=2.6279907866658165, CurrSamplesPerSec=2.611671859879963, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:49:28,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=8450, skipped=0, lr=[9.843267578351153e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:49:28,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=8450/global_step=8450, RunningAvgSamplesPerSec=2.627986817689908, CurrSamplesPerSec=2.5893445906018955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:49:43,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=8460, skipped=0, lr=[9.842848757323789e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:49:43,751] [INFO] [timer.py:259:stop] epoch=0/micro_step=8460/global_step=8460, RunningAvgSamplesPerSec=2.627989756553572, CurrSamplesPerSec=2.636168688711378, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:49:59,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=8470, skipped=0, lr=[9.842429386389387e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:49:59,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=8470/global_step=8470, RunningAvgSamplesPerSec=2.6279934914148564, CurrSamplesPerSec=2.601098921046348, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:50:14,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=8480, skipped=0, lr=[9.842009465595564e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:50:14,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=8480/global_step=8480, RunningAvgSamplesPerSec=2.627997949010111, CurrSamplesPerSec=2.6383546589639195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:50:29,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=8490, skipped=0, lr=[9.841588994990006e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:50:29,666] [INFO] [timer.py:259:stop] epoch=0/micro_step=8490/global_step=8490, RunningAvgSamplesPerSec=2.6279894075201677, CurrSamplesPerSec=2.6227590862321875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:50:45,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=8500, skipped=0, lr=[9.841167974620454e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:50:45,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=8500/global_step=8500, RunningAvgSamplesPerSec=2.6279726593371695, CurrSamplesPerSec=2.633322789138307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:51:00,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=8510, skipped=0, lr=[9.840746404534716e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:51:00,345] [INFO] [timer.py:259:stop] epoch=0/micro_step=8510/global_step=8510, RunningAvgSamplesPerSec=2.627982192002988, CurrSamplesPerSec=2.623690554898879, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:51:15,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=8520, skipped=0, lr=[9.84032428478066e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:51:15,639] [INFO] [timer.py:259:stop] epoch=0/micro_step=8520/global_step=8520, RunningAvgSamplesPerSec=2.6279856411400964, CurrSamplesPerSec=2.624063163477181, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:51:30,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=8530, skipped=0, lr=[9.839901615406223e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:51:30,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=8530/global_step=8530, RunningAvgSamplesPerSec=2.6279799641617214, CurrSamplesPerSec=2.6331802008427503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:51:46,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=8540, skipped=0, lr=[9.839478396459391e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:51:46,292] [INFO] [timer.py:259:stop] epoch=0/micro_step=8540/global_step=8540, RunningAvgSamplesPerSec=2.6279788960539237, CurrSamplesPerSec=2.631469106741667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:52:01,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=8550, skipped=0, lr=[9.83905462798823e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:52:01,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=8550/global_step=8550, RunningAvgSamplesPerSec=2.627976788763472, CurrSamplesPerSec=2.630504886586883, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:52:16,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=8560, skipped=0, lr=[9.838630310040852e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:52:16,922] [INFO] [timer.py:259:stop] epoch=0/micro_step=8560/global_step=8560, RunningAvgSamplesPerSec=2.627978100246945, CurrSamplesPerSec=2.647476193311218, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:52:32,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=8570, skipped=0, lr=[9.838205442665441e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:52:32,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=8570/global_step=8570, RunningAvgSamplesPerSec=2.6279704857138255, CurrSamplesPerSec=2.6544521798487826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:52:47,549] [INFO] [logging.py:96:log_dist] [Rank 0] step=8580, skipped=0, lr=[9.837780025910241e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:52:47,570] [INFO] [timer.py:259:stop] epoch=0/micro_step=8580/global_step=8580, RunningAvgSamplesPerSec=2.627977665713818, CurrSamplesPerSec=2.657024114790074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:53:02,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=8590, skipped=0, lr=[9.837354059823558e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:53:02,886] [INFO] [timer.py:259:stop] epoch=0/micro_step=8590/global_step=8590, RunningAvgSamplesPerSec=2.6279779914683257, CurrSamplesPerSec=2.630022423012616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:53:18,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=8600, skipped=0, lr=[9.836927544453759e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:53:18,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=8600/global_step=8600, RunningAvgSamplesPerSec=2.6279845637480133, CurrSamplesPerSec=2.5988095685896617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:53:33,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=8610, skipped=0, lr=[9.836500479849279e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:53:33,469] [INFO] [timer.py:259:stop] epoch=0/micro_step=8610/global_step=8610, RunningAvgSamplesPerSec=2.6279865956154183, CurrSamplesPerSec=2.6134525562804485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:53:48,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=8620, skipped=0, lr=[9.836072866058606e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:53:48,766] [INFO] [timer.py:259:stop] epoch=0/micro_step=8620/global_step=8620, RunningAvgSamplesPerSec=2.6279895400249624, CurrSamplesPerSec=2.610410934549453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:54:04,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=8630, skipped=0, lr=[9.8356447031303e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:54:04,100] [INFO] [timer.py:259:stop] epoch=0/micro_step=8630/global_step=8630, RunningAvgSamplesPerSec=2.6279842442557784, CurrSamplesPerSec=2.63512238122432, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:54:19,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=8640, skipped=0, lr=[9.835215991112977e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:54:19,407] [INFO] [timer.py:259:stop] epoch=0/micro_step=8640/global_step=8640, RunningAvgSamplesPerSec=2.6279848037915468, CurrSamplesPerSec=2.556698718058929, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:54:34,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=8650, skipped=0, lr=[9.834786730055317e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:54:34,798] [INFO] [timer.py:259:stop] epoch=0/micro_step=8650/global_step=8650, RunningAvgSamplesPerSec=2.627973778725525, CurrSamplesPerSec=2.6267237392243508, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:54:50,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=8660, skipped=0, lr=[9.834356920006065e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:54:50,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=8660/global_step=8660, RunningAvgSamplesPerSec=2.6279595122935, CurrSamplesPerSec=2.635508180292744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:55:05,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=8670, skipped=0, lr=[9.833926561014024e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:55:05,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=8670/global_step=8670, RunningAvgSamplesPerSec=2.627957048254392, CurrSamplesPerSec=2.6374824066645717, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:55:20,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=8680, skipped=0, lr=[9.833495653128062e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:55:20,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=8680/global_step=8680, RunningAvgSamplesPerSec=2.6279597521325333, CurrSamplesPerSec=2.6408476744362765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:55:36,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=8690, skipped=0, lr=[9.833064196397109e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:55:36,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=8690/global_step=8690, RunningAvgSamplesPerSec=2.6279639310930727, CurrSamplesPerSec=2.6365676387447983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:55:51,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=8700, skipped=0, lr=[9.832632190870156e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:55:51,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=8700/global_step=8700, RunningAvgSamplesPerSec=2.6279574146425313, CurrSamplesPerSec=2.635083890304621, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:56:06,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=8710, skipped=0, lr=[9.832199636596257e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:56:06,784] [INFO] [timer.py:259:stop] epoch=0/micro_step=8710/global_step=8710, RunningAvgSamplesPerSec=2.6279523405439744, CurrSamplesPerSec=2.625019791044436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:56:22,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=8720, skipped=0, lr=[9.831766533624534e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:56:22,010] [INFO] [timer.py:259:stop] epoch=0/micro_step=8720/global_step=8720, RunningAvgSamplesPerSec=2.6279638666990777, CurrSamplesPerSec=2.6360216503590803, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:56:37,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=8730, skipped=0, lr=[9.831332882004159e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:56:37,361] [INFO] [timer.py:259:stop] epoch=0/micro_step=8730/global_step=8730, RunningAvgSamplesPerSec=2.627957996782803, CurrSamplesPerSec=2.6428344908527768, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:56:52,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=8740, skipped=0, lr=[9.830898681784375e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:56:52,661] [INFO] [timer.py:259:stop] epoch=0/micro_step=8740/global_step=8740, RunningAvgSamplesPerSec=2.6279607306641033, CurrSamplesPerSec=2.6474594823480824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:57:07,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=8750, skipped=0, lr=[9.83046393301449e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:57:07,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=8750/global_step=8750, RunningAvgSamplesPerSec=2.6279611656669113, CurrSamplesPerSec=2.591217018706374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:57:23,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=8760, skipped=0, lr=[9.830028635743864e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:57:23,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=8760/global_step=8760, RunningAvgSamplesPerSec=2.627959337018533, CurrSamplesPerSec=2.6528349779070646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:57:38,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=8770, skipped=0, lr=[9.829592790021929e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:57:38,556] [INFO] [timer.py:259:stop] epoch=0/micro_step=8770/global_step=8770, RunningAvgSamplesPerSec=2.6279615160328036, CurrSamplesPerSec=2.624714251342225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:57:53,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=8780, skipped=0, lr=[9.829156395898174e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:57:53,870] [INFO] [timer.py:259:stop] epoch=0/micro_step=8780/global_step=8780, RunningAvgSamplesPerSec=2.627961775191439, CurrSamplesPerSec=2.644319064070637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:58:09,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=8790, skipped=0, lr=[9.82871945342215e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:58:09,281] [INFO] [timer.py:259:stop] epoch=0/micro_step=8790/global_step=8790, RunningAvgSamplesPerSec=2.627943118848376, CurrSamplesPerSec=2.6375873118707256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:58:24,640] [INFO] [logging.py:96:log_dist] [Rank 0] step=8800, skipped=0, lr=[9.828281962643475e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:58:24,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=8800/global_step=8800, RunningAvgSamplesPerSec=2.6279370832215507, CurrSamplesPerSec=2.6379136909126903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:58:39,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=8810, skipped=0, lr=[9.827843923611826e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:58:39,962] [INFO] [timer.py:259:stop] epoch=0/micro_step=8810/global_step=8810, RunningAvgSamplesPerSec=2.627934877845375, CurrSamplesPerSec=2.61955220566219, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:58:55,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=8820, skipped=0, lr=[9.82740533637694e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:58:55,234] [INFO] [timer.py:259:stop] epoch=0/micro_step=8820/global_step=8820, RunningAvgSamplesPerSec=2.627941312054596, CurrSamplesPerSec=2.6376507566179246, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:59:10,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=8830, skipped=0, lr=[9.82696620098862e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:59:10,596] [INFO] [timer.py:259:stop] epoch=0/micro_step=8830/global_step=8830, RunningAvgSamplesPerSec=2.6279291988948277, CurrSamplesPerSec=2.6487409896029974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:59:25,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=8840, skipped=0, lr=[9.826526517496732e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:59:25,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=8840/global_step=8840, RunningAvgSamplesPerSec=2.6279316797411174, CurrSamplesPerSec=2.6333343621992276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:59:41,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=8850, skipped=0, lr=[9.8260862859512e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:59:41,168] [INFO] [timer.py:259:stop] epoch=0/micro_step=8850/global_step=8850, RunningAvgSamplesPerSec=2.6279323778223764, CurrSamplesPerSec=2.63547216208004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 03:59:56,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=8860, skipped=0, lr=[9.825645506402011e-06], mom=[(0.9, 0.95)] +[2024-11-01 03:59:56,437] [INFO] [timer.py:259:stop] epoch=0/micro_step=8860/global_step=8860, RunningAvgSamplesPerSec=2.627938981147682, CurrSamplesPerSec=2.618709915409153, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:00:11,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=8870, skipped=0, lr=[9.82520417889922e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:00:11,775] [INFO] [timer.py:259:stop] epoch=0/micro_step=8870/global_step=8870, RunningAvgSamplesPerSec=2.6279350716966445, CurrSamplesPerSec=2.6293629301895267, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:00:27,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=8880, skipped=0, lr=[9.824762303492934e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:00:27,077] [INFO] [timer.py:259:stop] epoch=0/micro_step=8880/global_step=8880, RunningAvgSamplesPerSec=2.627934186665866, CurrSamplesPerSec=2.6381372677663735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:00:42,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=8890, skipped=0, lr=[9.824319880233332e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:00:42,431] [INFO] [timer.py:259:stop] epoch=0/micro_step=8890/global_step=8890, RunningAvgSamplesPerSec=2.6279258641746157, CurrSamplesPerSec=2.651074374486055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:00:57,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=8900, skipped=0, lr=[9.823876909170653e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:00:57,696] [INFO] [timer.py:259:stop] epoch=0/micro_step=8900/global_step=8900, RunningAvgSamplesPerSec=2.6279318025363607, CurrSamplesPerSec=2.6018734262986345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:01:13,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=8910, skipped=0, lr=[9.823433390355193e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:01:13,110] [INFO] [timer.py:259:stop] epoch=0/micro_step=8910/global_step=8910, RunningAvgSamplesPerSec=2.62791574075781, CurrSamplesPerSec=2.60378382556446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:01:28,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=8920, skipped=0, lr=[9.822989323837315e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:01:28,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=8920/global_step=8920, RunningAvgSamplesPerSec=2.627905976578736, CurrSamplesPerSec=2.6287445440932147, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:01:43,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=8930, skipped=0, lr=[9.822544709667442e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:01:43,761] [INFO] [timer.py:259:stop] epoch=0/micro_step=8930/global_step=8930, RunningAvgSamplesPerSec=2.6279065428677333, CurrSamplesPerSec=2.625153281653285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:01:59,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=8940, skipped=0, lr=[9.822099547896065e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:01:59,061] [INFO] [timer.py:259:stop] epoch=0/micro_step=8940/global_step=8940, RunningAvgSamplesPerSec=2.6279095592116173, CurrSamplesPerSec=2.6039955917240105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:02:14,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=8950, skipped=0, lr=[9.821653838573723e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:02:14,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=8950/global_step=8950, RunningAvgSamplesPerSec=2.6279092846555367, CurrSamplesPerSec=2.6164453716420075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:02:29,727] [INFO] [logging.py:96:log_dist] [Rank 0] step=8960, skipped=0, lr=[9.821207581751035e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:02:29,728] [INFO] [timer.py:259:stop] epoch=0/micro_step=8960/global_step=8960, RunningAvgSamplesPerSec=2.6279060141956863, CurrSamplesPerSec=2.6434287031627353, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:02:45,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=8970, skipped=0, lr=[9.820760777478669e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:02:45,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=8970/global_step=8970, RunningAvgSamplesPerSec=2.6279052460628756, CurrSamplesPerSec=2.63385856316136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:03:00,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=8980, skipped=0, lr=[9.82031342580736e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:03:00,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=8980/global_step=8980, RunningAvgSamplesPerSec=2.6279077840889187, CurrSamplesPerSec=2.645075320859468, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:03:15,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=8990, skipped=0, lr=[9.819865526787908e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:03:15,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=8990/global_step=8990, RunningAvgSamplesPerSec=2.6279126870704594, CurrSamplesPerSec=2.630616661946136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:03:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=9000, skipped=0, lr=[9.819417080471167e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:03:30,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=9000/global_step=9000, RunningAvgSamplesPerSec=2.627906315021324, CurrSamplesPerSec=2.644573741731182, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:03:46,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=9010, skipped=0, lr=[9.818968086908063e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:03:46,215] [INFO] [timer.py:259:stop] epoch=0/micro_step=9010/global_step=9010, RunningAvgSamplesPerSec=2.6279171375595682, CurrSamplesPerSec=2.632110661216609, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:04:01,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=9020, skipped=0, lr=[9.818518546149579e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:04:01,571] [INFO] [timer.py:259:stop] epoch=0/micro_step=9020/global_step=9020, RunningAvgSamplesPerSec=2.627909763608507, CurrSamplesPerSec=2.6421248676749096, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:04:16,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=9030, skipped=0, lr=[9.818068458246757e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:04:16,824] [INFO] [timer.py:259:stop] epoch=0/micro_step=9030/global_step=9030, RunningAvgSamplesPerSec=2.627922065143836, CurrSamplesPerSec=2.632785167276191, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:04:32,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=9040, skipped=0, lr=[9.817617823250707e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:04:32,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=9040/global_step=9040, RunningAvgSamplesPerSec=2.6279147108207184, CurrSamplesPerSec=2.6486778465345266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:04:47,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=9050, skipped=0, lr=[9.8171666412126e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:04:47,456] [INFO] [timer.py:259:stop] epoch=0/micro_step=9050/global_step=9050, RunningAvgSamplesPerSec=2.6279211075270164, CurrSamplesPerSec=2.6323353204947852, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:05:02,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=9060, skipped=0, lr=[9.816714912183664e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:05:02,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=9060/global_step=9060, RunningAvgSamplesPerSec=2.6279002823162516, CurrSamplesPerSec=2.6263898443726204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:05:18,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=9070, skipped=0, lr=[9.816262636215197e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:05:18,277] [INFO] [timer.py:259:stop] epoch=0/micro_step=9070/global_step=9070, RunningAvgSamplesPerSec=2.627882704913966, CurrSamplesPerSec=2.577781565914463, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:05:33,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=9080, skipped=0, lr=[9.815809813358552e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:05:33,673] [INFO] [timer.py:259:stop] epoch=0/micro_step=9080/global_step=9080, RunningAvgSamplesPerSec=2.627868429205861, CurrSamplesPerSec=2.574258983638956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:05:49,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=9090, skipped=0, lr=[9.81535644366515e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:05:49,019] [INFO] [timer.py:259:stop] epoch=0/micro_step=9090/global_step=9090, RunningAvgSamplesPerSec=2.6278653940219123, CurrSamplesPerSec=2.583697749280771, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:06:04,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=9100, skipped=0, lr=[9.81490252718647e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:06:04,283] [INFO] [timer.py:259:stop] epoch=0/micro_step=9100/global_step=9100, RunningAvgSamplesPerSec=2.627872328693262, CurrSamplesPerSec=2.6356348728622327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:06:19,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=9110, skipped=0, lr=[9.814448063974054e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:06:19,624] [INFO] [timer.py:259:stop] epoch=0/micro_step=9110/global_step=9110, RunningAvgSamplesPerSec=2.6278650281091376, CurrSamplesPerSec=2.617779528492888, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:06:34,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=9120, skipped=0, lr=[9.813993054079505e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:06:34,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=9120/global_step=9120, RunningAvgSamplesPerSec=2.6278618346113687, CurrSamplesPerSec=2.6288120951084846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:06:50,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=9130, skipped=0, lr=[9.813537497554493e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:06:50,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=9130/global_step=9130, RunningAvgSamplesPerSec=2.6278724976287675, CurrSamplesPerSec=2.6334951556054893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:07:05,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=9140, skipped=0, lr=[9.813081394450744e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:07:05,478] [INFO] [timer.py:259:stop] epoch=0/micro_step=9140/global_step=9140, RunningAvgSamplesPerSec=2.627875833056071, CurrSamplesPerSec=2.6009307691302843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:07:20,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=9150, skipped=0, lr=[9.812624744820049e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:07:20,765] [INFO] [timer.py:259:stop] epoch=0/micro_step=9150/global_step=9150, RunningAvgSamplesPerSec=2.627877469611886, CurrSamplesPerSec=2.6518164772376087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:07:36,106] [INFO] [logging.py:96:log_dist] [Rank 0] step=9160, skipped=0, lr=[9.812167548714263e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:07:36,110] [INFO] [timer.py:259:stop] epoch=0/micro_step=9160/global_step=9160, RunningAvgSamplesPerSec=2.6278727066277456, CurrSamplesPerSec=2.6291985206871735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:07:51,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=9170, skipped=0, lr=[9.811709806185296e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:07:51,507] [INFO] [timer.py:259:stop] epoch=0/micro_step=9170/global_step=9170, RunningAvgSamplesPerSec=2.6278607185497305, CurrSamplesPerSec=2.615064067527016, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:08:06,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=9180, skipped=0, lr=[9.81125151728513e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:08:06,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=9180/global_step=9180, RunningAvgSamplesPerSec=2.627873710449454, CurrSamplesPerSec=2.6546991518080643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:08:22,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=9190, skipped=0, lr=[9.810792682065799e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:08:22,044] [INFO] [timer.py:259:stop] epoch=0/micro_step=9190/global_step=9190, RunningAvgSamplesPerSec=2.627874840528602, CurrSamplesPerSec=2.641447647557311, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:08:37,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=9200, skipped=0, lr=[9.81033330057941e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:08:37,322] [INFO] [timer.py:259:stop] epoch=0/micro_step=9200/global_step=9200, RunningAvgSamplesPerSec=2.6278811051883313, CurrSamplesPerSec=2.6306550224690533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:08:52,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=9210, skipped=0, lr=[9.809873372878118e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:08:52,602] [INFO] [timer.py:259:stop] epoch=0/micro_step=9210/global_step=9210, RunningAvgSamplesPerSec=2.6278824689506, CurrSamplesPerSec=2.6286395173977017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:09:07,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=9220, skipped=0, lr=[9.809412899014157e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:09:07,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=9220/global_step=9220, RunningAvgSamplesPerSec=2.627874637859883, CurrSamplesPerSec=2.6439765145447236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:09:23,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=9230, skipped=0, lr=[9.808951879039806e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:09:23,302] [INFO] [timer.py:259:stop] epoch=0/micro_step=9230/global_step=9230, RunningAvgSamplesPerSec=2.6278704283318755, CurrSamplesPerSec=2.6382231413059096, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:09:38,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=9240, skipped=0, lr=[9.80849031300742e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:09:38,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=9240/global_step=9240, RunningAvgSamplesPerSec=2.627863497927524, CurrSamplesPerSec=2.6172380266775113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:09:53,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=9250, skipped=0, lr=[9.808028200969404e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:09:53,979] [INFO] [timer.py:259:stop] epoch=0/micro_step=9250/global_step=9250, RunningAvgSamplesPerSec=2.627861397359797, CurrSamplesPerSec=2.629164734815037, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:10:09,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=9260, skipped=0, lr=[9.807565542978236e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:10:09,292] [INFO] [timer.py:259:stop] epoch=0/micro_step=9260/global_step=9260, RunningAvgSamplesPerSec=2.6278602561959414, CurrSamplesPerSec=2.6449510549360538, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:10:24,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=9270, skipped=0, lr=[9.807102339086449e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:10:24,552] [INFO] [timer.py:259:stop] epoch=0/micro_step=9270/global_step=9270, RunningAvgSamplesPerSec=2.6278668404170173, CurrSamplesPerSec=2.639065163988035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:10:39,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=9280, skipped=0, lr=[9.80663858934664e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:10:39,910] [INFO] [timer.py:259:stop] epoch=0/micro_step=9280/global_step=9280, RunningAvgSamplesPerSec=2.627857862311438, CurrSamplesPerSec=2.6137510003210442, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:10:55,205] [INFO] [logging.py:96:log_dist] [Rank 0] step=9290, skipped=0, lr=[9.806174293811469e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:10:55,206] [INFO] [timer.py:259:stop] epoch=0/micro_step=9290/global_step=9290, RunningAvgSamplesPerSec=2.6278597171889846, CurrSamplesPerSec=2.6295059296072703, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:11:10,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=9300, skipped=0, lr=[9.805709452533654e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:11:10,574] [INFO] [timer.py:259:stop] epoch=0/micro_step=9300/global_step=9300, RunningAvgSamplesPerSec=2.627852153736809, CurrSamplesPerSec=2.628006651834991, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:11:25,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=9310, skipped=0, lr=[9.80524406556598e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:11:25,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=9310/global_step=9310, RunningAvgSamplesPerSec=2.6278565367343867, CurrSamplesPerSec=2.619523166254469, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:11:41,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=9320, skipped=0, lr=[9.804778132961292e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:11:41,189] [INFO] [timer.py:259:stop] epoch=0/micro_step=9320/global_step=9320, RunningAvgSamplesPerSec=2.627852592709645, CurrSamplesPerSec=2.64718085812277, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:11:56,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=9330, skipped=0, lr=[9.804311654772496e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:11:56,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=9330/global_step=9330, RunningAvgSamplesPerSec=2.6278500185218836, CurrSamplesPerSec=2.6376432923716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:12:11,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=9340, skipped=0, lr=[9.803844631052562e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:12:11,882] [INFO] [timer.py:259:stop] epoch=0/micro_step=9340/global_step=9340, RunningAvgSamplesPerSec=2.6278390279820947, CurrSamplesPerSec=2.6309697859954677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:12:27,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=9350, skipped=0, lr=[9.80337706185452e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:12:27,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=9350/global_step=9350, RunningAvgSamplesPerSec=2.627835462624591, CurrSamplesPerSec=2.637528431241392, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:12:42,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=9360, skipped=0, lr=[9.802908947231462e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:12:42,505] [INFO] [timer.py:259:stop] epoch=0/micro_step=9360/global_step=9360, RunningAvgSamplesPerSec=2.6278352513163368, CurrSamplesPerSec=2.644955641725683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:12:57,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=9370, skipped=0, lr=[9.802440287236544e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:12:57,770] [INFO] [timer.py:259:stop] epoch=0/micro_step=9370/global_step=9370, RunningAvgSamplesPerSec=2.627842100830633, CurrSamplesPerSec=2.585577556792658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:13:13,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=9380, skipped=0, lr=[9.801971081922981e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:13:13,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=9380/global_step=9380, RunningAvgSamplesPerSec=2.627846436298283, CurrSamplesPerSec=2.6372668174206253, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:13:28,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=9390, skipped=0, lr=[9.801501331344051e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:13:28,381] [INFO] [timer.py:259:stop] epoch=0/micro_step=9390/global_step=9390, RunningAvgSamplesPerSec=2.627844922340503, CurrSamplesPerSec=2.6003079496841783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:13:43,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=9400, skipped=0, lr=[9.801031035553096e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:13:43,654] [INFO] [timer.py:259:stop] epoch=0/micro_step=9400/global_step=9400, RunningAvgSamplesPerSec=2.6278509031356725, CurrSamplesPerSec=2.647821321863701, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:13:58,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=9410, skipped=0, lr=[9.800560194603519e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:13:58,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=9410/global_step=9410, RunningAvgSamplesPerSec=2.6278543396923713, CurrSamplesPerSec=2.5994940983252017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:14:14,290] [INFO] [logging.py:96:log_dist] [Rank 0] step=9420, skipped=0, lr=[9.80008880854878e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:14:14,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=9420/global_step=9420, RunningAvgSamplesPerSec=2.627850142259167, CurrSamplesPerSec=2.616728990645137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:14:29,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=9430, skipped=0, lr=[9.79961687744241e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:14:29,605] [INFO] [timer.py:259:stop] epoch=0/micro_step=9430/global_step=9430, RunningAvgSamplesPerSec=2.627850588447614, CurrSamplesPerSec=2.6456313244817578, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:14:44,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=9440, skipped=0, lr=[9.799144401337993e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:14:44,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=9440/global_step=9440, RunningAvgSamplesPerSec=2.6278519557266105, CurrSamplesPerSec=2.62597588330505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:15:00,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=9450, skipped=0, lr=[9.798671380289183e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:15:00,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=9450/global_step=9450, RunningAvgSamplesPerSec=2.627846551667137, CurrSamplesPerSec=2.6317489742853097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:15:15,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=9460, skipped=0, lr=[9.798197814349685e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:15:15,580] [INFO] [timer.py:259:stop] epoch=0/micro_step=9460/global_step=9460, RunningAvgSamplesPerSec=2.6278413160739094, CurrSamplesPerSec=2.6147217199977297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:15:30,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=9470, skipped=0, lr=[9.797723703573282e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:15:30,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=9470/global_step=9470, RunningAvgSamplesPerSec=2.627836643449345, CurrSamplesPerSec=2.595271073057938, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:15:46,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=9480, skipped=0, lr=[9.797249048013799e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:15:46,211] [INFO] [timer.py:259:stop] epoch=0/micro_step=9480/global_step=9480, RunningAvgSamplesPerSec=2.627836983443478, CurrSamplesPerSec=2.6152401668806973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:16:01,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=9490, skipped=0, lr=[9.796773847725141e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:16:01,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=9490/global_step=9490, RunningAvgSamplesPerSec=2.6278374346881095, CurrSamplesPerSec=2.6274489781514623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:16:16,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=9500, skipped=0, lr=[9.796298102761264e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:16:16,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=9500/global_step=9500, RunningAvgSamplesPerSec=2.6278181729242185, CurrSamplesPerSec=2.626695363085452, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:16:32,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=9510, skipped=0, lr=[9.795821813176191e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:16:32,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=9510/global_step=9510, RunningAvgSamplesPerSec=2.62781408921821, CurrSamplesPerSec=2.6330058094872744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:16:47,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=9520, skipped=0, lr=[9.795344979024002e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:16:47,576] [INFO] [timer.py:259:stop] epoch=0/micro_step=9520/global_step=9520, RunningAvgSamplesPerSec=2.627809104086843, CurrSamplesPerSec=2.6031604469607266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:17:02,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=9530, skipped=0, lr=[9.794867600358843e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:17:02,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=9530/global_step=9530, RunningAvgSamplesPerSec=2.6278110348840222, CurrSamplesPerSec=2.627329242940413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:17:18,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=9540, skipped=0, lr=[9.794389677234922e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:17:18,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=9540/global_step=9540, RunningAvgSamplesPerSec=2.6278064065279585, CurrSamplesPerSec=2.598339867709258, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:17:33,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=9550, skipped=0, lr=[9.793911209706506e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:17:33,578] [INFO] [timer.py:259:stop] epoch=0/micro_step=9550/global_step=9550, RunningAvgSamplesPerSec=2.627801167350663, CurrSamplesPerSec=2.6392254125005756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:17:48,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=9560, skipped=0, lr=[9.793432197827924e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:17:48,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=9560/global_step=9560, RunningAvgSamplesPerSec=2.627795440712078, CurrSamplesPerSec=2.590642844952818, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:18:04,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=9570, skipped=0, lr=[9.792952641653568e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:18:04,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=9570/global_step=9570, RunningAvgSamplesPerSec=2.62780889480059, CurrSamplesPerSec=2.6530468270116754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:18:19,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=9580, skipped=0, lr=[9.792472541237894e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:18:19,451] [INFO] [timer.py:259:stop] epoch=0/micro_step=9580/global_step=9580, RunningAvgSamplesPerSec=2.627807093654451, CurrSamplesPerSec=2.5948596386898695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:18:34,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=9590, skipped=0, lr=[9.791991896635418e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:18:34,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=9590/global_step=9590, RunningAvgSamplesPerSec=2.6278082337333806, CurrSamplesPerSec=2.633839956243113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:18:50,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=9600, skipped=0, lr=[9.791510707900713e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:18:50,123] [INFO] [timer.py:259:stop] epoch=0/micro_step=9600/global_step=9600, RunningAvgSamplesPerSec=2.6278009786240184, CurrSamplesPerSec=2.6472372465979745, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:19:05,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=9610, skipped=0, lr=[9.791028975088421e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:19:05,419] [INFO] [timer.py:259:stop] epoch=0/micro_step=9610/global_step=9610, RunningAvgSamplesPerSec=2.6278064481240766, CurrSamplesPerSec=2.6277498044394263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:19:20,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=9620, skipped=0, lr=[9.790546698253245e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:19:20,741] [INFO] [timer.py:259:stop] epoch=0/micro_step=9620/global_step=9620, RunningAvgSamplesPerSec=2.6278069587285544, CurrSamplesPerSec=2.617242109563238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:19:36,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=9630, skipped=0, lr=[9.790063877449943e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:19:36,066] [INFO] [timer.py:259:stop] epoch=0/micro_step=9630/global_step=9630, RunningAvgSamplesPerSec=2.6278026490867634, CurrSamplesPerSec=2.635638185249658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:19:51,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=9640, skipped=0, lr=[9.789580512733343e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:19:51,353] [INFO] [timer.py:259:stop] epoch=0/micro_step=9640/global_step=9640, RunningAvgSamplesPerSec=2.627806578251308, CurrSamplesPerSec=2.634724695937556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:20:06,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=9650, skipped=0, lr=[9.78909660415833e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:20:06,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=9650/global_step=9650, RunningAvgSamplesPerSec=2.6278153640091766, CurrSamplesPerSec=2.6339958489945623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:20:21,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=9660, skipped=0, lr=[9.788612151779853e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:20:21,874] [INFO] [timer.py:259:stop] epoch=0/micro_step=9660/global_step=9660, RunningAvgSamplesPerSec=2.62782541990957, CurrSamplesPerSec=2.645046963848724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:20:37,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=9670, skipped=0, lr=[9.78812715565292e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:20:37,151] [INFO] [timer.py:259:stop] epoch=0/micro_step=9670/global_step=9670, RunningAvgSamplesPerSec=2.6278328856031195, CurrSamplesPerSec=2.62670358799015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:20:52,398] [INFO] [logging.py:96:log_dist] [Rank 0] step=9680, skipped=0, lr=[9.787641615832604e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:20:52,402] [INFO] [timer.py:259:stop] epoch=0/micro_step=9680/global_step=9680, RunningAvgSamplesPerSec=2.62784528378419, CurrSamplesPerSec=2.665600590861088, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:21:07,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=9690, skipped=0, lr=[9.787155532374035e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:21:07,619] [INFO] [timer.py:259:stop] epoch=0/micro_step=9690/global_step=9690, RunningAvgSamplesPerSec=2.627860019453646, CurrSamplesPerSec=2.646453453379987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:21:22,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=9700, skipped=0, lr=[9.786668905332412e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:21:22,804] [INFO] [timer.py:259:stop] epoch=0/micro_step=9700/global_step=9700, RunningAvgSamplesPerSec=2.627878107916405, CurrSamplesPerSec=2.652852595750609, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:21:38,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=9710, skipped=0, lr=[9.786181734762992e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:21:38,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=9710/global_step=9710, RunningAvgSamplesPerSec=2.6278868006367118, CurrSamplesPerSec=2.6370487764626303, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:21:53,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=9720, skipped=0, lr=[9.785694020721088e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:21:53,226] [INFO] [timer.py:259:stop] epoch=0/micro_step=9720/global_step=9720, RunningAvgSamplesPerSec=2.627907627895498, CurrSamplesPerSec=2.6457777679168117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:22:08,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=9730, skipped=0, lr=[9.785205763262086e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:22:08,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=9730/global_step=9730, RunningAvgSamplesPerSec=2.627919183473305, CurrSamplesPerSec=2.603408065844476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:22:23,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=9740, skipped=0, lr=[9.784716962441425e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:22:23,635] [INFO] [timer.py:259:stop] epoch=0/micro_step=9740/global_step=9740, RunningAvgSamplesPerSec=2.627938520408433, CurrSamplesPerSec=2.6461470776187976, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:22:38,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=9750, skipped=0, lr=[9.784227618314608e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:22:38,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=9750/global_step=9750, RunningAvgSamplesPerSec=2.627962168255668, CurrSamplesPerSec=2.6279761897440723, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:22:54,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=9760, skipped=0, lr=[9.783737730937202e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:22:54,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=9760/global_step=9760, RunningAvgSamplesPerSec=2.6279755956480355, CurrSamplesPerSec=2.6493671470403357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:23:09,268] [INFO] [logging.py:96:log_dist] [Rank 0] step=9770, skipped=0, lr=[9.78324730036483e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:23:09,270] [INFO] [timer.py:259:stop] epoch=0/micro_step=9770/global_step=9770, RunningAvgSamplesPerSec=2.627981919156936, CurrSamplesPerSec=2.5984320235580087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:23:24,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=9780, skipped=0, lr=[9.782756326653186e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:23:24,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=9780/global_step=9780, RunningAvgSamplesPerSec=2.6279900747737517, CurrSamplesPerSec=2.6419921417845695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:23:39,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=9790, skipped=0, lr=[9.782264809858019e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:23:39,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=9790/global_step=9790, RunningAvgSamplesPerSec=2.628008738786406, CurrSamplesPerSec=2.6491320418744517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:23:54,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=9800, skipped=0, lr=[9.781772750035137e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:23:54,911] [INFO] [timer.py:259:stop] epoch=0/micro_step=9800/global_step=9800, RunningAvgSamplesPerSec=2.628022660239448, CurrSamplesPerSec=2.6482070865096867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:24:10,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=9810, skipped=0, lr=[9.781280147240418e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:24:10,160] [INFO] [timer.py:259:stop] epoch=0/micro_step=9810/global_step=9810, RunningAvgSamplesPerSec=2.6280285200677485, CurrSamplesPerSec=2.670191779787377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:24:25,290] [INFO] [logging.py:96:log_dist] [Rank 0] step=9820, skipped=0, lr=[9.780787001529793e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:24:25,291] [INFO] [timer.py:259:stop] epoch=0/micro_step=9820/global_step=9820, RunningAvgSamplesPerSec=2.6280556227399607, CurrSamplesPerSec=2.647094818115964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:24:40,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=9830, skipped=0, lr=[9.780293312959264e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:24:40,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=9830/global_step=9830, RunningAvgSamplesPerSec=2.628064136157982, CurrSamplesPerSec=2.6568995651580196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:24:55,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=9840, skipped=0, lr=[9.779799081584885e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:24:55,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=9840/global_step=9840, RunningAvgSamplesPerSec=2.6280746941333217, CurrSamplesPerSec=2.627156448474132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:25:11,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=9850, skipped=0, lr=[9.779304307462777e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:25:11,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=9850/global_step=9850, RunningAvgSamplesPerSec=2.6280759753160456, CurrSamplesPerSec=2.621401828168925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:25:26,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=9860, skipped=0, lr=[9.778808990649124e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:25:26,307] [INFO] [timer.py:259:stop] epoch=0/micro_step=9860/global_step=9860, RunningAvgSamplesPerSec=2.6280777937411406, CurrSamplesPerSec=2.641515437125747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:25:41,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=9870, skipped=0, lr=[9.778313131200167e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:25:41,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=9870/global_step=9870, RunningAvgSamplesPerSec=2.628095678563949, CurrSamplesPerSec=2.65431737270974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:25:56,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=9880, skipped=0, lr=[9.777816729172211e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:25:56,669] [INFO] [timer.py:259:stop] epoch=0/micro_step=9880/global_step=9880, RunningAvgSamplesPerSec=2.628116265909643, CurrSamplesPerSec=2.65406711289775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:26:11,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=9890, skipped=0, lr=[9.777319784621626e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:26:11,909] [INFO] [timer.py:259:stop] epoch=0/micro_step=9890/global_step=9890, RunningAvgSamplesPerSec=2.628124959133059, CurrSamplesPerSec=2.619322362055299, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:26:27,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=9900, skipped=0, lr=[9.776822297604836e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:26:27,114] [INFO] [timer.py:259:stop] epoch=0/micro_step=9900/global_step=9900, RunningAvgSamplesPerSec=2.6281409959136712, CurrSamplesPerSec=2.6488714669373996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:26:42,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=9910, skipped=0, lr=[9.776324268178333e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:26:42,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=9910/global_step=9910, RunningAvgSamplesPerSec=2.6281469591140967, CurrSamplesPerSec=2.6465924728634285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:26:57,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=9920, skipped=0, lr=[9.775825696398668e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:26:57,601] [INFO] [timer.py:259:stop] epoch=0/micro_step=9920/global_step=9920, RunningAvgSamplesPerSec=2.6281617852354424, CurrSamplesPerSec=2.6536452212517476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:27:12,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=9930, skipped=0, lr=[9.775326582322455e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:27:12,795] [INFO] [timer.py:259:stop] epoch=0/micro_step=9930/global_step=9930, RunningAvgSamplesPerSec=2.62817899849327, CurrSamplesPerSec=2.6300059316601225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:27:28,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=9940, skipped=0, lr=[9.774826926006365e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:27:28,046] [INFO] [timer.py:259:stop] epoch=0/micro_step=9940/global_step=9940, RunningAvgSamplesPerSec=2.628189045023668, CurrSamplesPerSec=2.5888107928713655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:27:43,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=9950, skipped=0, lr=[9.774326727507139e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:27:43,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=9950/global_step=9950, RunningAvgSamplesPerSec=2.6282063000955667, CurrSamplesPerSec=2.6697452040852654, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:27:58,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=9960, skipped=0, lr=[9.773825986881571e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:27:58,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=9960/global_step=9960, RunningAvgSamplesPerSec=2.6282170792257205, CurrSamplesPerSec=2.568316698905941, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:28:13,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=9970, skipped=0, lr=[9.773324704186522e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:28:13,614] [INFO] [timer.py:259:stop] epoch=0/micro_step=9970/global_step=9970, RunningAvgSamplesPerSec=2.6282398405161356, CurrSamplesPerSec=2.654358947282998, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:28:28,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=9980, skipped=0, lr=[9.772822879478913e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:28:28,798] [INFO] [timer.py:259:stop] epoch=0/micro_step=9980/global_step=9980, RunningAvgSamplesPerSec=2.628255867821591, CurrSamplesPerSec=2.6072021528388247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:28:43,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=9990, skipped=0, lr=[9.772320512815724e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:28:43,977] [INFO] [timer.py:259:stop] epoch=0/micro_step=9990/global_step=9990, RunningAvgSamplesPerSec=2.6282733851505564, CurrSamplesPerSec=2.645085329361369, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:28:59,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=10000, skipped=0, lr=[9.771817604254004e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:28:59,186] [INFO] [timer.py:259:stop] epoch=0/micro_step=10000/global_step=10000, RunningAvgSamplesPerSec=2.628284980547973, CurrSamplesPerSec=2.6207233164030113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:29:14,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=10010, skipped=0, lr=[9.771314153850852e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:29:14,348] [INFO] [timer.py:259:stop] epoch=0/micro_step=10010/global_step=10010, RunningAvgSamplesPerSec=2.6283070348590756, CurrSamplesPerSec=2.651328679121483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:29:29,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=10020, skipped=0, lr=[9.77081016166344e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:29:29,547] [INFO] [timer.py:259:stop] epoch=0/micro_step=10020/global_step=10020, RunningAvgSamplesPerSec=2.628323509014994, CurrSamplesPerSec=2.656380874216997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:29:44,754] [INFO] [logging.py:96:log_dist] [Rank 0] step=10030, skipped=0, lr=[9.770305627748993e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:29:44,755] [INFO] [timer.py:259:stop] epoch=0/micro_step=10030/global_step=10030, RunningAvgSamplesPerSec=2.6283377586074113, CurrSamplesPerSec=2.6461182802752665, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:29:59,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=10040, skipped=0, lr=[9.769800552164802e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:29:59,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=10040/global_step=10040, RunningAvgSamplesPerSec=2.6283557577013346, CurrSamplesPerSec=2.638076288502372, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:30:15,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=10050, skipped=0, lr=[9.769294934968222e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:30:15,125] [INFO] [timer.py:259:stop] epoch=0/micro_step=10050/global_step=10050, RunningAvgSamplesPerSec=2.628374186013013, CurrSamplesPerSec=2.6439560977286343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:30:30,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=10060, skipped=0, lr=[9.76878877621666e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:30:30,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=10060/global_step=10060, RunningAvgSamplesPerSec=2.6283835055197917, CurrSamplesPerSec=2.646937371033389, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:30:45,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=10070, skipped=0, lr=[9.768282075967595e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:30:45,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=10070/global_step=10070, RunningAvgSamplesPerSec=2.6283977083872156, CurrSamplesPerSec=2.6638500772618183, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:31:00,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=10080, skipped=0, lr=[9.767774834278561e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:31:00,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=10080/global_step=10080, RunningAvgSamplesPerSec=2.6283996237665446, CurrSamplesPerSec=2.6323332554370893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:31:16,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=10090, skipped=0, lr=[9.767267051207157e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:31:16,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=10090/global_step=10090, RunningAvgSamplesPerSec=2.628417332910283, CurrSamplesPerSec=2.6483768083988304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:31:31,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=10100, skipped=0, lr=[9.766758726811038e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:31:31,249] [INFO] [timer.py:259:stop] epoch=0/micro_step=10100/global_step=10100, RunningAvgSamplesPerSec=2.6284300202101587, CurrSamplesPerSec=2.651419603820158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:31:46,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=10110, skipped=0, lr=[9.76624986114793e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:31:46,422] [INFO] [timer.py:259:stop] epoch=0/micro_step=10110/global_step=10110, RunningAvgSamplesPerSec=2.628448531704936, CurrSamplesPerSec=2.6354030264302932, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:32:01,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=10120, skipped=0, lr=[9.765740454275613e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:32:01,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=10120/global_step=10120, RunningAvgSamplesPerSec=2.6284547852733224, CurrSamplesPerSec=2.647540114691973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:32:16,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=10130, skipped=0, lr=[9.765230506251928e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:32:16,935] [INFO] [timer.py:259:stop] epoch=0/micro_step=10130/global_step=10130, RunningAvgSamplesPerSec=2.628460333241409, CurrSamplesPerSec=2.640932893092653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:32:32,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=10140, skipped=0, lr=[9.76472001713478e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:32:32,170] [INFO] [timer.py:259:stop] epoch=0/micro_step=10140/global_step=10140, RunningAvgSamplesPerSec=2.6284700343421084, CurrSamplesPerSec=2.636417241531435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:32:47,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=10150, skipped=0, lr=[9.764208986982137e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:32:47,380] [INFO] [timer.py:259:stop] epoch=0/micro_step=10150/global_step=10150, RunningAvgSamplesPerSec=2.6284830616685206, CurrSamplesPerSec=2.646445104326383, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:33:02,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=10160, skipped=0, lr=[9.763697415852028e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:33:02,616] [INFO] [timer.py:259:stop] epoch=0/micro_step=10160/global_step=10160, RunningAvgSamplesPerSec=2.628496609673365, CurrSamplesPerSec=2.6567994289899612, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:33:17,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=10170, skipped=0, lr=[9.763185303802538e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:33:17,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=10170/global_step=10170, RunningAvgSamplesPerSec=2.6285164934997467, CurrSamplesPerSec=2.5794330484220325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:33:32,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=10180, skipped=0, lr=[9.76267265089182e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:33:33,002] [INFO] [timer.py:259:stop] epoch=0/micro_step=10180/global_step=10180, RunningAvgSamplesPerSec=2.6285321460261213, CurrSamplesPerSec=2.6453539191158626, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:33:48,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=10190, skipped=0, lr=[9.762159457178087e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:33:48,176] [INFO] [timer.py:259:stop] epoch=0/micro_step=10190/global_step=10190, RunningAvgSamplesPerSec=2.6285507340539573, CurrSamplesPerSec=2.6390427473539426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:34:03,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=10200, skipped=0, lr=[9.761645722719608e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:34:03,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=10200/global_step=10200, RunningAvgSamplesPerSec=2.6285722234291016, CurrSamplesPerSec=2.6462768821207887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:34:18,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=10210, skipped=0, lr=[9.761131447574724e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:34:18,530] [INFO] [timer.py:259:stop] epoch=0/micro_step=10210/global_step=10210, RunningAvgSamplesPerSec=2.628591144306556, CurrSamplesPerSec=2.660711184836754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:34:33,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=10220, skipped=0, lr=[9.760616631801824e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:34:33,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=10220/global_step=10220, RunningAvgSamplesPerSec=2.6286083667792375, CurrSamplesPerSec=2.643782775827912, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:34:48,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=10230, skipped=0, lr=[9.760101275459371e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:34:48,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=10230/global_step=10230, RunningAvgSamplesPerSec=2.628623918070626, CurrSamplesPerSec=2.6581805431518055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:35:04,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=10240, skipped=0, lr=[9.759585378605883e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:35:04,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=10240/global_step=10240, RunningAvgSamplesPerSec=2.6286412463236983, CurrSamplesPerSec=2.6472113493703304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:35:19,339] [INFO] [logging.py:96:log_dist] [Rank 0] step=10250, skipped=0, lr=[9.759068941299939e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:35:19,341] [INFO] [timer.py:259:stop] epoch=0/micro_step=10250/global_step=10250, RunningAvgSamplesPerSec=2.6286541044282954, CurrSamplesPerSec=2.623112564483499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:35:34,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=10260, skipped=0, lr=[9.758551963600179e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:35:34,542] [INFO] [timer.py:259:stop] epoch=0/micro_step=10260/global_step=10260, RunningAvgSamplesPerSec=2.6286692157978337, CurrSamplesPerSec=2.6730306636381287, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:35:49,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=10270, skipped=0, lr=[9.758034445565309e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:35:49,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=10270/global_step=10270, RunningAvgSamplesPerSec=2.6286901046100524, CurrSamplesPerSec=2.644624182897593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:36:04,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=10280, skipped=0, lr=[9.757516387254092e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:36:04,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=10280/global_step=10280, RunningAvgSamplesPerSec=2.628705712305178, CurrSamplesPerSec=2.661272515580721, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:36:20,157] [INFO] [logging.py:96:log_dist] [Rank 0] step=10290, skipped=0, lr=[9.756997788725354e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:36:20,163] [INFO] [timer.py:259:stop] epoch=0/micro_step=10290/global_step=10290, RunningAvgSamplesPerSec=2.628708606943521, CurrSamplesPerSec=2.634217107787206, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:36:35,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=10300, skipped=0, lr=[9.756478650037982e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:36:35,320] [INFO] [timer.py:259:stop] epoch=0/micro_step=10300/global_step=10300, RunningAvgSamplesPerSec=2.6287307434275977, CurrSamplesPerSec=2.664496940684392, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:36:50,555] [INFO] [logging.py:96:log_dist] [Rank 0] step=10310, skipped=0, lr=[9.755958971250925e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:36:50,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=10310/global_step=10310, RunningAvgSamplesPerSec=2.6287395847254444, CurrSamplesPerSec=2.6437669446715724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:37:05,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=10320, skipped=0, lr=[9.755438752423191e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:37:05,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=10320/global_step=10320, RunningAvgSamplesPerSec=2.6287521073111653, CurrSamplesPerSec=2.646729836787322, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:37:20,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=10330, skipped=0, lr=[9.754917993613852e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:37:20,944] [INFO] [timer.py:259:stop] epoch=0/micro_step=10330/global_step=10330, RunningAvgSamplesPerSec=2.628769845317283, CurrSamplesPerSec=2.654876008604456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:37:36,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=10340, skipped=0, lr=[9.754396694882042e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:37:36,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=10340/global_step=10340, RunningAvgSamplesPerSec=2.628777511897728, CurrSamplesPerSec=2.6475392790984578, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:37:51,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=10350, skipped=0, lr=[9.75387485628695e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:37:51,403] [INFO] [timer.py:259:stop] epoch=0/micro_step=10350/global_step=10350, RunningAvgSamplesPerSec=2.6287881143480956, CurrSamplesPerSec=2.668201823577732, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:38:06,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=10360, skipped=0, lr=[9.753352477887836e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:38:06,584] [INFO] [timer.py:259:stop] epoch=0/micro_step=10360/global_step=10360, RunningAvgSamplesPerSec=2.628807235767919, CurrSamplesPerSec=2.642892359635334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:38:21,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=10370, skipped=0, lr=[9.752829559744015e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:38:21,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=10370/global_step=10370, RunningAvgSamplesPerSec=2.6288156608639777, CurrSamplesPerSec=2.6512595470033764, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:38:37,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=10380, skipped=0, lr=[9.752306101914861e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:38:37,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=10380/global_step=10380, RunningAvgSamplesPerSec=2.6288337921856417, CurrSamplesPerSec=2.632028075534238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:38:52,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=10390, skipped=0, lr=[9.751782104459818e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:38:52,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=10390/global_step=10390, RunningAvgSamplesPerSec=2.6288272115418314, CurrSamplesPerSec=2.6465636658248886, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:39:07,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=10400, skipped=0, lr=[9.751257567438383e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:39:07,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=10400/global_step=10400, RunningAvgSamplesPerSec=2.6288339887697574, CurrSamplesPerSec=2.672367306131352, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:39:22,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=10410, skipped=0, lr=[9.750732490910117e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:39:22,794] [INFO] [timer.py:259:stop] epoch=0/micro_step=10410/global_step=10410, RunningAvgSamplesPerSec=2.628854097050057, CurrSamplesPerSec=2.6372473332064734, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:39:38,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=10420, skipped=0, lr=[9.750206874934645e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:39:38,029] [INFO] [timer.py:259:stop] epoch=0/micro_step=10420/global_step=10420, RunningAvgSamplesPerSec=2.628862414450404, CurrSamplesPerSec=2.617491189683531, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:39:53,169] [INFO] [logging.py:96:log_dist] [Rank 0] step=10430, skipped=0, lr=[9.749680719571647e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:39:53,177] [INFO] [timer.py:259:stop] epoch=0/micro_step=10430/global_step=10430, RunningAvgSamplesPerSec=2.6288875047831617, CurrSamplesPerSec=2.6775069134791436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:40:08,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=10440, skipped=0, lr=[9.749154024880872e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:40:08,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=10440/global_step=10440, RunningAvgSamplesPerSec=2.628897821450815, CurrSamplesPerSec=2.594241728813829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:40:23,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=10450, skipped=0, lr=[9.748626790922125e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:40:23,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=10450/global_step=10450, RunningAvgSamplesPerSec=2.6289127982506235, CurrSamplesPerSec=2.66037070332805, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:40:38,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=10460, skipped=0, lr=[9.748099017755274e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:40:38,832] [INFO] [timer.py:259:stop] epoch=0/micro_step=10460/global_step=10460, RunningAvgSamplesPerSec=2.6289203260669765, CurrSamplesPerSec=2.6022564120082223, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:40:53,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=10470, skipped=0, lr=[9.747570705440246e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:40:54,003] [INFO] [timer.py:259:stop] epoch=0/micro_step=10470/global_step=10470, RunningAvgSamplesPerSec=2.6289394028129394, CurrSamplesPerSec=2.661091428886277, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:41:09,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=10480, skipped=0, lr=[9.747041854037033e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:41:09,205] [INFO] [timer.py:259:stop] epoch=0/micro_step=10480/global_step=10480, RunningAvgSamplesPerSec=2.628953462450098, CurrSamplesPerSec=2.6217111036471343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:41:24,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=10490, skipped=0, lr=[9.746512463605686e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:41:24,406] [INFO] [timer.py:259:stop] epoch=0/micro_step=10490/global_step=10490, RunningAvgSamplesPerSec=2.6289677947305545, CurrSamplesPerSec=2.6419334803584613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:41:39,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=10500, skipped=0, lr=[9.745982534206316e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:41:39,589] [INFO] [timer.py:259:stop] epoch=0/micro_step=10500/global_step=10500, RunningAvgSamplesPerSec=2.6289838132381447, CurrSamplesPerSec=2.6424057579623943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:41:54,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=10510, skipped=0, lr=[9.745452065899098e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:41:54,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=10510/global_step=10510, RunningAvgSamplesPerSec=2.629008648436687, CurrSamplesPerSec=2.6500848523426996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:42:09,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=10520, skipped=0, lr=[9.744921058744265e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:42:09,999] [INFO] [timer.py:259:stop] epoch=0/micro_step=10520/global_step=10520, RunningAvgSamplesPerSec=2.62901232096739, CurrSamplesPerSec=2.6513232322177056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:42:25,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=10530, skipped=0, lr=[9.744389512802118e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:42:25,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=10530/global_step=10530, RunningAvgSamplesPerSec=2.629019861482256, CurrSamplesPerSec=2.615494982233175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:42:40,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=10540, skipped=0, lr=[9.743857428133007e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:42:40,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=10540/global_step=10540, RunningAvgSamplesPerSec=2.629028846060242, CurrSamplesPerSec=2.6562357779744667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:42:55,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=10550, skipped=0, lr=[9.743324804797356e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:42:55,646] [INFO] [timer.py:259:stop] epoch=0/micro_step=10550/global_step=10550, RunningAvgSamplesPerSec=2.629051301278663, CurrSamplesPerSec=2.649470070973897, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:43:10,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=10560, skipped=0, lr=[9.742791642855643e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:43:10,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=10560/global_step=10560, RunningAvgSamplesPerSec=2.629060954568871, CurrSamplesPerSec=2.6429648032659374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:43:26,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=10570, skipped=0, lr=[9.742257942368407e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:43:26,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=10570/global_step=10570, RunningAvgSamplesPerSec=2.629075604634484, CurrSamplesPerSec=2.638055547898601, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:43:41,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=10580, skipped=0, lr=[9.741723703396253e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:43:41,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=10580/global_step=10580, RunningAvgSamplesPerSec=2.6290817085553457, CurrSamplesPerSec=2.635370322805864, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:43:56,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=10590, skipped=0, lr=[9.741188925999841e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:43:56,624] [INFO] [timer.py:259:stop] epoch=0/micro_step=10590/global_step=10590, RunningAvgSamplesPerSec=2.629092215864308, CurrSamplesPerSec=2.6560856515925835, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:44:11,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=10600, skipped=0, lr=[9.740653610239897e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:44:11,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=10600/global_step=10600, RunningAvgSamplesPerSec=2.6291134645607745, CurrSamplesPerSec=2.6741435272201772, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:44:26,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=10610, skipped=0, lr=[9.740117756177205e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:44:26,996] [INFO] [timer.py:259:stop] epoch=0/micro_step=10610/global_step=10610, RunningAvgSamplesPerSec=2.6291303201560217, CurrSamplesPerSec=2.6668145179898026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:44:42,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=10620, skipped=0, lr=[9.739581363872613e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:44:42,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=10620/global_step=10620, RunningAvgSamplesPerSec=2.62914544712349, CurrSamplesPerSec=2.657420984225078, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:44:57,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=10630, skipped=0, lr=[9.739044433387028e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:44:57,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=10630/global_step=10630, RunningAvgSamplesPerSec=2.6291551690130714, CurrSamplesPerSec=2.6369285791848354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:45:12,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=10640, skipped=0, lr=[9.738506964781418e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:45:12,600] [INFO] [timer.py:259:stop] epoch=0/micro_step=10640/global_step=10640, RunningAvgSamplesPerSec=2.6291707463035237, CurrSamplesPerSec=2.6654007060992173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:45:27,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=10650, skipped=0, lr=[9.737968958116812e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:45:27,787] [INFO] [timer.py:259:stop] epoch=0/micro_step=10650/global_step=10650, RunningAvgSamplesPerSec=2.6291872200266075, CurrSamplesPerSec=2.62624430594441, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:45:43,037] [INFO] [logging.py:96:log_dist] [Rank 0] step=10660, skipped=0, lr=[9.737430413454301e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:45:43,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=10660/global_step=10660, RunningAvgSamplesPerSec=2.6291915473052336, CurrSamplesPerSec=2.6432779386573486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:45:58,193] [INFO] [logging.py:96:log_dist] [Rank 0] step=10670, skipped=0, lr=[9.736891330855039e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:45:58,214] [INFO] [timer.py:259:stop] epoch=0/micro_step=10670/global_step=10670, RunningAvgSamplesPerSec=2.629207360737655, CurrSamplesPerSec=2.6485273189124947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:46:13,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=10680, skipped=0, lr=[9.736351710380239e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:46:13,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=10680/global_step=10680, RunningAvgSamplesPerSec=2.6292232307136927, CurrSamplesPerSec=2.642556839213836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:46:28,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=10690, skipped=0, lr=[9.735811552091172e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:46:28,613] [INFO] [timer.py:259:stop] epoch=0/micro_step=10690/global_step=10690, RunningAvgSamplesPerSec=2.6292339185632785, CurrSamplesPerSec=2.582361528777304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:46:43,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=10700, skipped=0, lr=[9.735270856049175e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:46:43,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=10700/global_step=10700, RunningAvgSamplesPerSec=2.6292411113534806, CurrSamplesPerSec=2.6491826570774784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:46:59,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=10710, skipped=0, lr=[9.734729622315645e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:46:59,081] [INFO] [timer.py:259:stop] epoch=0/micro_step=10710/global_step=10710, RunningAvgSamplesPerSec=2.6292489053895163, CurrSamplesPerSec=2.648843028474137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:47:14,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=10720, skipped=0, lr=[9.734187850952037e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:47:14,302] [INFO] [timer.py:259:stop] epoch=0/micro_step=10720/global_step=10720, RunningAvgSamplesPerSec=2.629260745693588, CurrSamplesPerSec=2.6452763394544636, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:47:29,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=10730, skipped=0, lr=[9.733645542019872e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:47:29,596] [INFO] [timer.py:259:stop] epoch=0/micro_step=10730/global_step=10730, RunningAvgSamplesPerSec=2.6292603074423764, CurrSamplesPerSec=2.6138784603590426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:47:44,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=10740, skipped=0, lr=[9.733102695580728e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:47:44,815] [INFO] [timer.py:259:stop] epoch=0/micro_step=10740/global_step=10740, RunningAvgSamplesPerSec=2.629273398799032, CurrSamplesPerSec=2.6015304894793765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:48:00,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=10750, skipped=0, lr=[9.732559311696246e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:48:00,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=10750/global_step=10750, RunningAvgSamplesPerSec=2.6292868862235514, CurrSamplesPerSec=2.6422451230204214, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:48:15,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=10760, skipped=0, lr=[9.732015390428126e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:48:15,208] [INFO] [timer.py:259:stop] epoch=0/micro_step=10760/global_step=10760, RunningAvgSamplesPerSec=2.629304666424844, CurrSamplesPerSec=2.640744172343112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:48:30,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=10770, skipped=0, lr=[9.731470931838131e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:48:30,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=10770/global_step=10770, RunningAvgSamplesPerSec=2.6293147704964657, CurrSamplesPerSec=2.654872227571343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:48:45,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=10780, skipped=0, lr=[9.730925935988085e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:48:45,640] [INFO] [timer.py:259:stop] epoch=0/micro_step=10780/global_step=10780, RunningAvgSamplesPerSec=2.6293281878452093, CurrSamplesPerSec=2.6747356993136613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:49:00,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=10790, skipped=0, lr=[9.730380402939873e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:49:00,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=10790/global_step=10790, RunningAvgSamplesPerSec=2.6293380749373996, CurrSamplesPerSec=2.6433608153591868, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:49:16,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=10800, skipped=0, lr=[9.729834332755439e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:49:16,080] [INFO] [timer.py:259:stop] epoch=0/micro_step=10800/global_step=10800, RunningAvgSamplesPerSec=2.6293484328008767, CurrSamplesPerSec=2.645067814532747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:49:31,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=10810, skipped=0, lr=[9.72928772549679e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:49:31,332] [INFO] [timer.py:259:stop] epoch=0/micro_step=10810/global_step=10810, RunningAvgSamplesPerSec=2.6293569018240683, CurrSamplesPerSec=2.641331207340464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:49:46,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=10820, skipped=0, lr=[9.728740581225995e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:49:46,549] [INFO] [timer.py:259:stop] epoch=0/micro_step=10820/global_step=10820, RunningAvgSamplesPerSec=2.6293681094209074, CurrSamplesPerSec=2.6471624801758433, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:50:01,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=10830, skipped=0, lr=[9.72819290000518e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:50:01,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=10830/global_step=10830, RunningAvgSamplesPerSec=2.629370497852652, CurrSamplesPerSec=2.6344574328077335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:50:17,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=10840, skipped=0, lr=[9.727644681896534e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:50:17,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=10840/global_step=10840, RunningAvgSamplesPerSec=2.629376716750871, CurrSamplesPerSec=2.630312704672013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:50:32,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=10850, skipped=0, lr=[9.727095926962312e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:50:32,361] [INFO] [timer.py:259:stop] epoch=0/micro_step=10850/global_step=10850, RunningAvgSamplesPerSec=2.6293766339536067, CurrSamplesPerSec=2.6541032212994273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:50:47,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=10860, skipped=0, lr=[9.726546635264817e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:50:47,678] [INFO] [timer.py:259:stop] epoch=0/micro_step=10860/global_step=10860, RunningAvgSamplesPerSec=2.629376200669431, CurrSamplesPerSec=2.6316420562059295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:51:02,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=10870, skipped=0, lr=[9.72599680686643e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:51:02,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=10870/global_step=10870, RunningAvgSamplesPerSec=2.6293849166391445, CurrSamplesPerSec=2.664914247505597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:51:18,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=10880, skipped=0, lr=[9.725446441829581e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:51:18,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=10880/global_step=10880, RunningAvgSamplesPerSec=2.629387413420764, CurrSamplesPerSec=2.6312924658407435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:51:33,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=10890, skipped=0, lr=[9.724895540216762e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:51:33,398] [INFO] [timer.py:259:stop] epoch=0/micro_step=10890/global_step=10890, RunningAvgSamplesPerSec=2.6293980526371246, CurrSamplesPerSec=2.6657297696300177, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:51:48,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=10900, skipped=0, lr=[9.724344102090528e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:51:48,633] [INFO] [timer.py:259:stop] epoch=0/micro_step=10900/global_step=10900, RunningAvgSamplesPerSec=2.6294122336402554, CurrSamplesPerSec=2.582569427031064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:52:03,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=10910, skipped=0, lr=[9.7237921275135e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:52:03,828] [INFO] [timer.py:259:stop] epoch=0/micro_step=10910/global_step=10910, RunningAvgSamplesPerSec=2.629428778845926, CurrSamplesPerSec=2.657614622096688, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:52:19,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=10920, skipped=0, lr=[9.723239616548349e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:52:19,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=10920/global_step=10920, RunningAvgSamplesPerSec=2.629429077447828, CurrSamplesPerSec=2.5967207624554582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:52:34,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=10930, skipped=0, lr=[9.722686569257817e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:52:34,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=10930/global_step=10930, RunningAvgSamplesPerSec=2.6294426270820184, CurrSamplesPerSec=2.6526969795614144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:52:49,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=10940, skipped=0, lr=[9.7221329857047e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:52:49,519] [INFO] [timer.py:259:stop] epoch=0/micro_step=10940/global_step=10940, RunningAvgSamplesPerSec=2.629455799462972, CurrSamplesPerSec=2.6417741507668424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:53:04,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=10950, skipped=0, lr=[9.721578865951858e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:53:04,728] [INFO] [timer.py:259:stop] epoch=0/micro_step=10950/global_step=10950, RunningAvgSamplesPerSec=2.6294723580489423, CurrSamplesPerSec=2.6429610560841983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:53:19,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=10960, skipped=0, lr=[9.721024210062214e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:53:19,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=10960/global_step=10960, RunningAvgSamplesPerSec=2.629476618292379, CurrSamplesPerSec=2.6354493924534848, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:53:35,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=10970, skipped=0, lr=[9.720469018098744e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:53:35,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=10970/global_step=10970, RunningAvgSamplesPerSec=2.6294882920575517, CurrSamplesPerSec=2.6457998818671125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:53:50,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=10980, skipped=0, lr=[9.719913290124496e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:53:50,440] [INFO] [timer.py:259:stop] epoch=0/micro_step=10980/global_step=10980, RunningAvgSamplesPerSec=2.6294971558700944, CurrSamplesPerSec=2.638715673865253, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:54:05,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=10990, skipped=0, lr=[9.719357026202572e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:54:05,657] [INFO] [timer.py:259:stop] epoch=0/micro_step=10990/global_step=10990, RunningAvgSamplesPerSec=2.6295083747143964, CurrSamplesPerSec=2.63713499367236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:54:20,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=11000, skipped=0, lr=[9.718800226396133e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:54:20,891] [INFO] [timer.py:259:stop] epoch=0/micro_step=11000/global_step=11000, RunningAvgSamplesPerSec=2.6295157320068037, CurrSamplesPerSec=2.6481519106798834, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:54:36,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=11010, skipped=0, lr=[9.718242890768405e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:54:36,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=11010/global_step=11010, RunningAvgSamplesPerSec=2.6295259555314443, CurrSamplesPerSec=2.678398575388112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:54:51,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=11020, skipped=0, lr=[9.717685019382674e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:54:51,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=11020/global_step=11020, RunningAvgSamplesPerSec=2.6295495583658735, CurrSamplesPerSec=2.6389522544436717, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:55:06,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=11030, skipped=0, lr=[9.717126612302288e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:55:06,433] [INFO] [timer.py:259:stop] epoch=0/micro_step=11030/global_step=11030, RunningAvgSamplesPerSec=2.6295671713854984, CurrSamplesPerSec=2.6404619729173446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:55:21,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=11040, skipped=0, lr=[9.716567669590654e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:55:21,691] [INFO] [timer.py:259:stop] epoch=0/micro_step=11040/global_step=11040, RunningAvgSamplesPerSec=2.6295743490069388, CurrSamplesPerSec=2.6453005304735475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:55:36,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=11050, skipped=0, lr=[9.716008191311236e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:55:36,924] [INFO] [timer.py:259:stop] epoch=0/micro_step=11050/global_step=11050, RunningAvgSamplesPerSec=2.6295864014887806, CurrSamplesPerSec=2.6188918205893805, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:55:52,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=11060, skipped=0, lr=[9.715448177527567e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:55:52,136] [INFO] [timer.py:259:stop] epoch=0/micro_step=11060/global_step=11060, RunningAvgSamplesPerSec=2.6295999780635864, CurrSamplesPerSec=2.648057866269471, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:56:07,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=11070, skipped=0, lr=[9.714887628303235e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:56:07,379] [INFO] [timer.py:259:stop] epoch=0/micro_step=11070/global_step=11070, RunningAvgSamplesPerSec=2.629605339762047, CurrSamplesPerSec=2.6428353234791606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:56:22,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=11080, skipped=0, lr=[9.714326543701893e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:56:22,619] [INFO] [timer.py:259:stop] epoch=0/micro_step=11080/global_step=11080, RunningAvgSamplesPerSec=2.6296157231970407, CurrSamplesPerSec=2.646132052839624, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:56:37,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=11090, skipped=0, lr=[9.71376492378725e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:56:37,861] [INFO] [timer.py:259:stop] epoch=0/micro_step=11090/global_step=11090, RunningAvgSamplesPerSec=2.629622448859646, CurrSamplesPerSec=2.6006594331142874, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:56:53,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=11100, skipped=0, lr=[9.713202768623078e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:56:53,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=11100/global_step=11100, RunningAvgSamplesPerSec=2.629631210811816, CurrSamplesPerSec=2.63734683008454, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:57:08,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=11110, skipped=0, lr=[9.712640078273212e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:57:08,274] [INFO] [timer.py:259:stop] epoch=0/micro_step=11110/global_step=11110, RunningAvgSamplesPerSec=2.6296477102370432, CurrSamplesPerSec=2.6260153416921757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:57:23,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=11120, skipped=0, lr=[9.712076852801542e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:57:23,447] [INFO] [timer.py:259:stop] epoch=0/micro_step=11120/global_step=11120, RunningAvgSamplesPerSec=2.6296653456055736, CurrSamplesPerSec=2.6750406273818323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:57:38,640] [INFO] [logging.py:96:log_dist] [Rank 0] step=11130, skipped=0, lr=[9.711513092272026e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:57:38,641] [INFO] [timer.py:259:stop] epoch=0/micro_step=11130/global_step=11130, RunningAvgSamplesPerSec=2.629681213455703, CurrSamplesPerSec=2.617779528492888, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:57:53,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=11140, skipped=0, lr=[9.710948796748679e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:57:53,893] [INFO] [timer.py:259:stop] epoch=0/micro_step=11140/global_step=11140, RunningAvgSamplesPerSec=2.629688408002872, CurrSamplesPerSec=2.6395621638440345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:58:09,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=11150, skipped=0, lr=[9.710383966295574e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:58:09,173] [INFO] [timer.py:259:stop] epoch=0/micro_step=11150/global_step=11150, RunningAvgSamplesPerSec=2.629687405001975, CurrSamplesPerSec=2.599178767191304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:58:24,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=11160, skipped=0, lr=[9.70981860097685e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:58:24,379] [INFO] [timer.py:259:stop] epoch=0/micro_step=11160/global_step=11160, RunningAvgSamplesPerSec=2.629698345098829, CurrSamplesPerSec=2.6370143740535013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:58:39,581] [INFO] [logging.py:96:log_dist] [Rank 0] step=11170, skipped=0, lr=[9.709252700856706e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:58:39,585] [INFO] [timer.py:259:stop] epoch=0/micro_step=11170/global_step=11170, RunningAvgSamplesPerSec=2.6297113160447187, CurrSamplesPerSec=2.6363223716747273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:58:54,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=11180, skipped=0, lr=[9.708686265999397e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:58:54,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=11180/global_step=11180, RunningAvgSamplesPerSec=2.6297260671909766, CurrSamplesPerSec=2.651834081555698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:59:10,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=11190, skipped=0, lr=[9.708119296469241e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:59:10,003] [INFO] [timer.py:259:stop] epoch=0/micro_step=11190/global_step=11190, RunningAvgSamplesPerSec=2.629736308620147, CurrSamplesPerSec=2.6674344071417107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:59:25,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=11200, skipped=0, lr=[9.70755179233062e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:59:25,201] [INFO] [timer.py:259:stop] epoch=0/micro_step=11200/global_step=11200, RunningAvgSamplesPerSec=2.629749892938353, CurrSamplesPerSec=2.653013684028157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:59:40,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=11210, skipped=0, lr=[9.706983753647975e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:59:40,408] [INFO] [timer.py:259:stop] epoch=0/micro_step=11210/global_step=11210, RunningAvgSamplesPerSec=2.6297604494090274, CurrSamplesPerSec=2.6640036207203437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 04:59:55,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=11220, skipped=0, lr=[9.706415180485806e-06], mom=[(0.9, 0.95)] +[2024-11-01 04:59:55,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=11220/global_step=11220, RunningAvgSamplesPerSec=2.6297736716031648, CurrSamplesPerSec=2.672293667359006, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:00:10,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=11230, skipped=0, lr=[9.705846072908673e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:00:10,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=11230/global_step=11230, RunningAvgSamplesPerSec=2.6297772167168567, CurrSamplesPerSec=2.6454556971849725, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:00:26,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=11240, skipped=0, lr=[9.705276430981202e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:00:26,080] [INFO] [timer.py:259:stop] epoch=0/micro_step=11240/global_step=11240, RunningAvgSamplesPerSec=2.6297889940106187, CurrSamplesPerSec=2.6481080224570976, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:00:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=11250, skipped=0, lr=[9.704706254768071e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:00:41,291] [INFO] [timer.py:259:stop] epoch=0/micro_step=11250/global_step=11250, RunningAvgSamplesPerSec=2.629800374130016, CurrSamplesPerSec=2.643381639468043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:00:56,450] [INFO] [logging.py:96:log_dist] [Rank 0] step=11260, skipped=0, lr=[9.704135544334028e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:00:56,452] [INFO] [timer.py:259:stop] epoch=0/micro_step=11260/global_step=11260, RunningAvgSamplesPerSec=2.6298217923382454, CurrSamplesPerSec=2.6350131196126063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:01:11,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=11270, skipped=0, lr=[9.703564299743875e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:01:11,752] [INFO] [timer.py:259:stop] epoch=0/micro_step=11270/global_step=11270, RunningAvgSamplesPerSec=2.6298220732124737, CurrSamplesPerSec=2.6387372548964794, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:01:26,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=11280, skipped=0, lr=[9.70299252106248e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:01:26,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=11280/global_step=11280, RunningAvgSamplesPerSec=2.6298343751283815, CurrSamplesPerSec=2.6404619729173446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:01:42,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=11290, skipped=0, lr=[9.702420208354765e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:01:42,230] [INFO] [timer.py:259:stop] epoch=0/micro_step=11290/global_step=11290, RunningAvgSamplesPerSec=2.629845574899304, CurrSamplesPerSec=2.634840968186767, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:01:57,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=11300, skipped=0, lr=[9.701847361685717e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:01:57,432] [INFO] [timer.py:259:stop] epoch=0/micro_step=11300/global_step=11300, RunningAvgSamplesPerSec=2.629858700520956, CurrSamplesPerSec=2.6366144601123507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:02:12,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=11310, skipped=0, lr=[9.701273981120385e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:02:12,601] [INFO] [timer.py:259:stop] epoch=0/micro_step=11310/global_step=11310, RunningAvgSamplesPerSec=2.6298758148138632, CurrSamplesPerSec=2.6781082712580355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:02:27,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=11320, skipped=0, lr=[9.700700066723875e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:02:27,809] [INFO] [timer.py:259:stop] epoch=0/micro_step=11320/global_step=11320, RunningAvgSamplesPerSec=2.6298863399581904, CurrSamplesPerSec=2.592907005393585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:02:43,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=11330, skipped=0, lr=[9.700125618561355e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:02:43,046] [INFO] [timer.py:259:stop] epoch=0/micro_step=11330/global_step=11330, RunningAvgSamplesPerSec=2.629894126751746, CurrSamplesPerSec=2.6536989472613457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:02:58,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=11340, skipped=0, lr=[9.699550636698054e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:02:58,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=11340/global_step=11340, RunningAvgSamplesPerSec=2.6299150741698405, CurrSamplesPerSec=2.6123460990110683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:03:13,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=11350, skipped=0, lr=[9.698975121199264e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:03:13,356] [INFO] [timer.py:259:stop] epoch=0/micro_step=11350/global_step=11350, RunningAvgSamplesPerSec=2.629932147523595, CurrSamplesPerSec=2.645122861918075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:03:28,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=11360, skipped=0, lr=[9.69839907213033e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:03:28,574] [INFO] [timer.py:259:stop] epoch=0/micro_step=11360/global_step=11360, RunningAvgSamplesPerSec=2.6299419446116015, CurrSamplesPerSec=2.5815735684221965, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:03:43,743] [INFO] [logging.py:96:log_dist] [Rank 0] step=11370, skipped=0, lr=[9.697822489556665e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:03:43,752] [INFO] [timer.py:259:stop] epoch=0/micro_step=11370/global_step=11370, RunningAvgSamplesPerSec=2.6299583393483523, CurrSamplesPerSec=2.6519451618099588, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:03:58,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=11380, skipped=0, lr=[9.697245373543743e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:03:58,901] [INFO] [timer.py:259:stop] epoch=0/micro_step=11380/global_step=11380, RunningAvgSamplesPerSec=2.629978236487139, CurrSamplesPerSec=2.6080698950087884, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:04:14,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=11390, skipped=0, lr=[9.696667724157089e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:04:14,091] [INFO] [timer.py:259:stop] epoch=0/micro_step=11390/global_step=11390, RunningAvgSamplesPerSec=2.629989917728553, CurrSamplesPerSec=2.6509248309501654, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:04:29,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=11400, skipped=0, lr=[9.696089541462302e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:04:29,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=11400/global_step=11400, RunningAvgSamplesPerSec=2.630000262956555, CurrSamplesPerSec=2.6360170945031234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:04:44,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=11410, skipped=0, lr=[9.695510825525032e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:04:44,513] [INFO] [timer.py:259:stop] epoch=0/micro_step=11410/global_step=11410, RunningAvgSamplesPerSec=2.63001043660713, CurrSamplesPerSec=2.650035039828612, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:04:59,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=11420, skipped=0, lr=[9.69493157641099e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:04:59,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=11420/global_step=11420, RunningAvgSamplesPerSec=2.630019691033585, CurrSamplesPerSec=2.6232462713454026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:05:14,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=11430, skipped=0, lr=[9.694351794185954e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:05:14,986] [INFO] [timer.py:259:stop] epoch=0/micro_step=11430/global_step=11430, RunningAvgSamplesPerSec=2.6300252438477902, CurrSamplesPerSec=2.651847075368978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:05:30,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=11440, skipped=0, lr=[9.693771478915758e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:05:30,217] [INFO] [timer.py:259:stop] epoch=0/micro_step=11440/global_step=11440, RunningAvgSamplesPerSec=2.6300359703365115, CurrSamplesPerSec=2.642025009963406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:05:45,450] [INFO] [logging.py:96:log_dist] [Rank 0] step=11450, skipped=0, lr=[9.693190630666295e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:05:45,452] [INFO] [timer.py:259:stop] epoch=0/micro_step=11450/global_step=11450, RunningAvgSamplesPerSec=2.6300463578960867, CurrSamplesPerSec=2.6500861081445652, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:06:00,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=11460, skipped=0, lr=[9.692609249503521e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:06:00,655] [INFO] [timer.py:259:stop] epoch=0/micro_step=11460/global_step=11460, RunningAvgSamplesPerSec=2.6300593815040907, CurrSamplesPerSec=2.676311838564797, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:06:15,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=11470, skipped=0, lr=[9.692027335493452e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:06:15,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=11470/global_step=11470, RunningAvgSamplesPerSec=2.63006911525013, CurrSamplesPerSec=2.637558700532346, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:06:31,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=11480, skipped=0, lr=[9.691444888702166e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:06:31,086] [INFO] [timer.py:259:stop] epoch=0/micro_step=11480/global_step=11480, RunningAvgSamplesPerSec=2.630078246411512, CurrSamplesPerSec=2.6637413808860546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:06:46,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=11490, skipped=0, lr=[9.6908619091958e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:06:46,229] [INFO] [timer.py:259:stop] epoch=0/micro_step=11490/global_step=11490, RunningAvgSamplesPerSec=2.630098393252383, CurrSamplesPerSec=2.6560793441328996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:07:01,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=11500, skipped=0, lr=[9.690278397040547e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:07:01,557] [INFO] [timer.py:259:stop] epoch=0/micro_step=11500/global_step=11500, RunningAvgSamplesPerSec=2.6300923321605576, CurrSamplesPerSec=2.60386262764949, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:07:16,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=11510, skipped=0, lr=[9.68969435230267e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:07:16,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=11510/global_step=11510, RunningAvgSamplesPerSec=2.630103359818444, CurrSamplesPerSec=2.650047597428735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:07:31,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=11520, skipped=0, lr=[9.689109775048486e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:07:31,993] [INFO] [timer.py:259:stop] epoch=0/micro_step=11520/global_step=11520, RunningAvgSamplesPerSec=2.6301104985405237, CurrSamplesPerSec=2.637883828228064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:07:47,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=11530, skipped=0, lr=[9.688524665344374e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:07:47,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=11530/global_step=11530, RunningAvgSamplesPerSec=2.6301257329570915, CurrSamplesPerSec=2.655040704053086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:08:02,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=11540, skipped=0, lr=[9.687939023256774e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:08:02,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=11540/global_step=11540, RunningAvgSamplesPerSec=2.6301314017365787, CurrSamplesPerSec=2.6466567690952965, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:08:17,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=11550, skipped=0, lr=[9.687352848852183e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:08:17,582] [INFO] [timer.py:259:stop] epoch=0/micro_step=11550/global_step=11550, RunningAvgSamplesPerSec=2.6301458205306725, CurrSamplesPerSec=2.650542042865822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:08:32,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=11560, skipped=0, lr=[9.686766142197164e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:08:32,830] [INFO] [timer.py:259:stop] epoch=0/micro_step=11560/global_step=11560, RunningAvgSamplesPerSec=2.6301502462752993, CurrSamplesPerSec=2.6315240023189426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:08:48,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=11570, skipped=0, lr=[9.686178903358338e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:08:48,013] [INFO] [timer.py:259:stop] epoch=0/micro_step=11570/global_step=11570, RunningAvgSamplesPerSec=2.6301673664807246, CurrSamplesPerSec=2.644645443800361, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:09:03,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=11580, skipped=0, lr=[9.685591132402384e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:09:03,264] [INFO] [timer.py:259:stop] epoch=0/micro_step=11580/global_step=11580, RunningAvgSamplesPerSec=2.630175177251915, CurrSamplesPerSec=2.640357254455284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:09:18,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=11590, skipped=0, lr=[9.685002829396046e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:09:18,494] [INFO] [timer.py:259:stop] epoch=0/micro_step=11590/global_step=11590, RunningAvgSamplesPerSec=2.6301833839719873, CurrSamplesPerSec=2.574048471923193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:09:33,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=11600, skipped=0, lr=[9.684413994406125e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:09:33,730] [INFO] [timer.py:259:stop] epoch=0/micro_step=11600/global_step=11600, RunningAvgSamplesPerSec=2.630193899815565, CurrSamplesPerSec=2.653703984186282, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:09:49,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=11610, skipped=0, lr=[9.683824627499483e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:09:49,077] [INFO] [timer.py:259:stop] epoch=0/micro_step=11610/global_step=11610, RunningAvgSamplesPerSec=2.6301848930502176, CurrSamplesPerSec=2.619007517255281, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:10:04,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=11620, skipped=0, lr=[9.68323472874304e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:10:04,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=11620/global_step=11620, RunningAvgSamplesPerSec=2.630194115287776, CurrSamplesPerSec=2.6198049984059026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:10:19,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=11630, skipped=0, lr=[9.682644298203787e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:10:19,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=11630/global_step=11630, RunningAvgSamplesPerSec=2.6301884662537205, CurrSamplesPerSec=2.6289023058412475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:10:34,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=11640, skipped=0, lr=[9.68205333594876e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:10:34,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=11640/global_step=11640, RunningAvgSamplesPerSec=2.630201447319907, CurrSamplesPerSec=2.668556197380732, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:10:50,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=11650, skipped=0, lr=[9.681461842045065e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:10:50,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=11650/global_step=11650, RunningAvgSamplesPerSec=2.6302126928707854, CurrSamplesPerSec=2.652223112801178, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:11:05,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=11660, skipped=0, lr=[9.680869816559869e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:11:05,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=11660/global_step=11660, RunningAvgSamplesPerSec=2.630227595501124, CurrSamplesPerSec=2.6348989012514714, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:11:20,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=11670, skipped=0, lr=[9.680277259560394e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:11:20,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=11670/global_step=11670, RunningAvgSamplesPerSec=2.6302412697229376, CurrSamplesPerSec=2.649585974569047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:11:35,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=11680, skipped=0, lr=[9.679684171113926e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:11:35,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=11680/global_step=11680, RunningAvgSamplesPerSec=2.630267160606514, CurrSamplesPerSec=2.6735409661499188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:11:50,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=11690, skipped=0, lr=[9.67909055128781e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:11:50,683] [INFO] [timer.py:259:stop] epoch=0/micro_step=11690/global_step=11690, RunningAvgSamplesPerSec=2.6302781644960085, CurrSamplesPerSec=2.656791014529719, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:12:05,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=11700, skipped=0, lr=[9.678496400149452e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:12:05,858] [INFO] [timer.py:259:stop] epoch=0/micro_step=11700/global_step=11700, RunningAvgSamplesPerSec=2.6302945265952515, CurrSamplesPerSec=2.6713265243459174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:12:21,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=11710, skipped=0, lr=[9.677901717766318e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:12:21,002] [INFO] [timer.py:259:stop] epoch=0/micro_step=11710/global_step=11710, RunningAvgSamplesPerSec=2.6303138411375695, CurrSamplesPerSec=2.6571701388564155, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:12:36,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=11720, skipped=0, lr=[9.677306504205934e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:12:36,211] [INFO] [timer.py:259:stop] epoch=0/micro_step=11720/global_step=11720, RunningAvgSamplesPerSec=2.630326475075832, CurrSamplesPerSec=2.65799145492018, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:12:51,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=11730, skipped=0, lr=[9.676710759535888e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:12:51,426] [INFO] [timer.py:259:stop] epoch=0/micro_step=11730/global_step=11730, RunningAvgSamplesPerSec=2.630336226175383, CurrSamplesPerSec=2.6495236280536307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:13:06,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=11740, skipped=0, lr=[9.676114483823824e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:13:06,633] [INFO] [timer.py:259:stop] epoch=0/micro_step=11740/global_step=11740, RunningAvgSamplesPerSec=2.630348593370167, CurrSamplesPerSec=2.6555164188996105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:13:21,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=11750, skipped=0, lr=[9.675517677137453e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:13:21,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=11750/global_step=11750, RunningAvgSamplesPerSec=2.6303569049751068, CurrSamplesPerSec=2.658513723953595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:13:37,037] [INFO] [logging.py:96:log_dist] [Rank 0] step=11760, skipped=0, lr=[9.67492033954454e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:13:37,039] [INFO] [timer.py:259:stop] epoch=0/micro_step=11760/global_step=11760, RunningAvgSamplesPerSec=2.630369412650723, CurrSamplesPerSec=2.6537077618925324, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:13:52,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=11770, skipped=0, lr=[9.674322471112913e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:13:52,234] [INFO] [timer.py:259:stop] epoch=0/micro_step=11770/global_step=11770, RunningAvgSamplesPerSec=2.6303819259966366, CurrSamplesPerSec=2.640521815768019, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:14:07,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=11780, skipped=0, lr=[9.673724071910464e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:14:07,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=11780/global_step=11780, RunningAvgSamplesPerSec=2.6303955225414444, CurrSamplesPerSec=2.6560553760593044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:14:22,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=11790, skipped=0, lr=[9.673125142005135e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:14:22,648] [INFO] [timer.py:259:stop] epoch=0/micro_step=11790/global_step=11790, RunningAvgSamplesPerSec=2.630403420352417, CurrSamplesPerSec=2.6409898471631736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:14:37,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=11800, skipped=0, lr=[9.672525681464941e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:14:37,821] [INFO] [timer.py:259:stop] epoch=0/micro_step=11800/global_step=11800, RunningAvgSamplesPerSec=2.630416517089466, CurrSamplesPerSec=2.6439802646064456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:14:52,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=11810, skipped=0, lr=[9.671925690357945e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:14:52,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=11810/global_step=11810, RunningAvgSamplesPerSec=2.630435104610191, CurrSamplesPerSec=2.6418411250945706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:15:08,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=11820, skipped=0, lr=[9.671325168752282e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:15:08,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=11820/global_step=11820, RunningAvgSamplesPerSec=2.6304474394153656, CurrSamplesPerSec=2.6107647484400065, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:15:23,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=11830, skipped=0, lr=[9.670724116716137e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:15:23,325] [INFO] [timer.py:259:stop] epoch=0/micro_step=11830/global_step=11830, RunningAvgSamplesPerSec=2.6304647856622787, CurrSamplesPerSec=2.656409474766811, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:15:38,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=11840, skipped=0, lr=[9.67012253431776e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:15:38,529] [INFO] [timer.py:259:stop] epoch=0/micro_step=11840/global_step=11840, RunningAvgSamplesPerSec=2.630475176295182, CurrSamplesPerSec=2.608728078011381, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:15:53,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=11850, skipped=0, lr=[9.669520421625465e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:15:53,752] [INFO] [timer.py:259:stop] epoch=0/micro_step=11850/global_step=11850, RunningAvgSamplesPerSec=2.630482648527902, CurrSamplesPerSec=2.599747868582565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:16:08,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=11860, skipped=0, lr=[9.668917778707619e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:16:08,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=11860/global_step=11860, RunningAvgSamplesPerSec=2.630494085517265, CurrSamplesPerSec=2.629449881872556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:16:24,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=11870, skipped=0, lr=[9.66831460563265e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:16:24,110] [INFO] [timer.py:259:stop] epoch=0/micro_step=11870/global_step=11870, RunningAvgSamplesPerSec=2.6305130182510075, CurrSamplesPerSec=2.657950608657575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:16:39,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=11880, skipped=0, lr=[9.667710902469053e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:16:39,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=11880/global_step=11880, RunningAvgSamplesPerSec=2.6305224364146453, CurrSamplesPerSec=2.6546525260316054, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:16:54,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=11890, skipped=0, lr=[9.667106669285376e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:16:54,518] [INFO] [timer.py:259:stop] epoch=0/micro_step=11890/global_step=11890, RunningAvgSamplesPerSec=2.630539262192766, CurrSamplesPerSec=2.6530850053987654, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:17:09,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=11900, skipped=0, lr=[9.66650190615023e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:17:09,691] [INFO] [timer.py:259:stop] epoch=0/micro_step=11900/global_step=11900, RunningAvgSamplesPerSec=2.6305539249411547, CurrSamplesPerSec=2.6504796513489755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:17:24,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=11910, skipped=0, lr=[9.665896613132287e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:17:24,835] [INFO] [timer.py:259:stop] epoch=0/micro_step=11910/global_step=11910, RunningAvgSamplesPerSec=2.6305720383973945, CurrSamplesPerSec=2.65086325910369, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:17:39,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=11920, skipped=0, lr=[9.66529079030028e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:17:40,000] [INFO] [timer.py:259:stop] epoch=0/micro_step=11920/global_step=11920, RunningAvgSamplesPerSec=2.6305871805726446, CurrSamplesPerSec=2.640084277517504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:17:55,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=11930, skipped=0, lr=[9.664684437722995e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:17:55,151] [INFO] [timer.py:259:stop] epoch=0/micro_step=11930/global_step=11930, RunningAvgSamplesPerSec=2.630606930131416, CurrSamplesPerSec=2.6808397884600743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:18:10,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=11940, skipped=0, lr=[9.664077555469287e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:18:10,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=11940/global_step=11940, RunningAvgSamplesPerSec=2.630633282463132, CurrSamplesPerSec=2.6395737918122903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:18:25,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=11950, skipped=0, lr=[9.66347014360807e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:18:25,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=11950/global_step=11950, RunningAvgSamplesPerSec=2.6306604758044725, CurrSamplesPerSec=2.663367144103998, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:18:40,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=11960, skipped=0, lr=[9.662862202208311e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:18:40,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=11960/global_step=11960, RunningAvgSamplesPerSec=2.6306644942848214, CurrSamplesPerSec=2.6228349407048186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:18:55,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=11970, skipped=0, lr=[9.662253731339043e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:18:55,723] [INFO] [timer.py:259:stop] epoch=0/micro_step=11970/global_step=11970, RunningAvgSamplesPerSec=2.6306818064174813, CurrSamplesPerSec=2.6608631006109036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:19:10,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=11980, skipped=0, lr=[9.661644731069362e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:19:10,891] [INFO] [timer.py:259:stop] epoch=0/micro_step=11980/global_step=11980, RunningAvgSamplesPerSec=2.6306956172146405, CurrSamplesPerSec=2.6558363195017565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:19:26,072] [INFO] [logging.py:96:log_dist] [Rank 0] step=11990, skipped=0, lr=[9.661035201468416e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:19:26,086] [INFO] [timer.py:259:stop] epoch=0/micro_step=11990/global_step=11990, RunningAvgSamplesPerSec=2.630706227396345, CurrSamplesPerSec=2.6578693409616423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:19:41,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=12000, skipped=0, lr=[9.66042514260542e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:19:41,273] [INFO] [timer.py:259:stop] epoch=0/micro_step=12000/global_step=12000, RunningAvgSamplesPerSec=2.6307175711003943, CurrSamplesPerSec=2.6411790188215005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:19:56,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=12010, skipped=0, lr=[9.659814554549645e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:19:56,425] [INFO] [timer.py:259:stop] epoch=0/micro_step=12010/global_step=12010, RunningAvgSamplesPerSec=2.6307340827355516, CurrSamplesPerSec=2.663338816335583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:20:11,621] [INFO] [logging.py:96:log_dist] [Rank 0] step=12020, skipped=0, lr=[9.659203437370421e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:20:11,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=12020/global_step=12020, RunningAvgSamplesPerSec=2.6307441677477246, CurrSamplesPerSec=2.6389605562869685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:20:26,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=12030, skipped=0, lr=[9.658591791137146e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:20:26,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=12030/global_step=12030, RunningAvgSamplesPerSec=2.630758772344043, CurrSamplesPerSec=2.6379796400698576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:20:41,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=12040, skipped=0, lr=[9.657979615919268e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:20:42,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=12040/global_step=12040, RunningAvgSamplesPerSec=2.630768573822982, CurrSamplesPerSec=2.671120676677958, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:20:57,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=12050, skipped=0, lr=[9.657366911786302e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:20:57,147] [INFO] [timer.py:259:stop] epoch=0/micro_step=12050/global_step=12050, RunningAvgSamplesPerSec=2.6307875651062176, CurrSamplesPerSec=2.6759489998858634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:21:12,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=12060, skipped=0, lr=[9.65675367880782e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:21:12,351] [INFO] [timer.py:259:stop] epoch=0/micro_step=12060/global_step=12060, RunningAvgSamplesPerSec=2.6307971627871716, CurrSamplesPerSec=2.6528274274743215, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:21:27,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=12070, skipped=0, lr=[9.656139917053455e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:21:27,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=12070/global_step=12070, RunningAvgSamplesPerSec=2.630809285119868, CurrSamplesPerSec=2.6573906782426002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:21:42,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=12080, skipped=0, lr=[9.655525626592902e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:21:42,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=12080/global_step=12080, RunningAvgSamplesPerSec=2.630819055521612, CurrSamplesPerSec=2.6450865804294326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:21:57,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=12090, skipped=0, lr=[9.654910807495908e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:21:57,886] [INFO] [timer.py:259:stop] epoch=0/micro_step=12090/global_step=12090, RunningAvgSamplesPerSec=2.6308351879593133, CurrSamplesPerSec=2.639338345421192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:22:13,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=12100, skipped=0, lr=[9.654295459832291e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:22:13,090] [INFO] [timer.py:259:stop] epoch=0/micro_step=12100/global_step=12100, RunningAvgSamplesPerSec=2.6308438586323364, CurrSamplesPerSec=2.623615881768964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:22:28,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=12110, skipped=0, lr=[9.653679583671923e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:22:28,234] [INFO] [timer.py:259:stop] epoch=0/micro_step=12110/global_step=12110, RunningAvgSamplesPerSec=2.6308611881917114, CurrSamplesPerSec=2.640078876716826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:22:43,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=12120, skipped=0, lr=[9.653063179084737e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:22:43,397] [INFO] [timer.py:259:stop] epoch=0/micro_step=12120/global_step=12120, RunningAvgSamplesPerSec=2.6308774537692305, CurrSamplesPerSec=2.6353993006599605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:22:58,549] [INFO] [logging.py:96:log_dist] [Rank 0] step=12130, skipped=0, lr=[9.652446246140724e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:22:58,587] [INFO] [timer.py:259:stop] epoch=0/micro_step=12130/global_step=12130, RunningAvgSamplesPerSec=2.630891130965579, CurrSamplesPerSec=2.6166106385499273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:23:13,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=12140, skipped=0, lr=[9.65182878490994e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:23:13,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=12140/global_step=12140, RunningAvgSamplesPerSec=2.630906765582474, CurrSamplesPerSec=2.651768695266698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:23:28,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=12150, skipped=0, lr=[9.651210795462494e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:23:28,946] [INFO] [timer.py:259:stop] epoch=0/micro_step=12150/global_step=12150, RunningAvgSamplesPerSec=2.630918585763651, CurrSamplesPerSec=2.6603259873244114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:23:44,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=12160, skipped=0, lr=[9.650592277868562e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:23:44,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=12160/global_step=12160, RunningAvgSamplesPerSec=2.630933113793292, CurrSamplesPerSec=2.6365730251891586, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:23:59,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=12170, skipped=0, lr=[9.649973232198377e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:23:59,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=12170/global_step=12170, RunningAvgSamplesPerSec=2.63094357829837, CurrSamplesPerSec=2.625601909563683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:24:14,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=12180, skipped=0, lr=[9.64935365852223e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:24:14,395] [INFO] [timer.py:259:stop] epoch=0/micro_step=12180/global_step=12180, RunningAvgSamplesPerSec=2.630968506480294, CurrSamplesPerSec=2.6770647224518735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:24:29,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=12190, skipped=0, lr=[9.648733556910476e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:24:29,503] [INFO] [timer.py:259:stop] epoch=0/micro_step=12190/global_step=12190, RunningAvgSamplesPerSec=2.630991404090597, CurrSamplesPerSec=2.6537837380459792, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:24:44,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=12200, skipped=0, lr=[9.648112927433526e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:24:44,668] [INFO] [timer.py:259:stop] epoch=0/micro_step=12200/global_step=12200, RunningAvgSamplesPerSec=2.6310077142577133, CurrSamplesPerSec=2.6418556851803947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:24:59,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=12210, skipped=0, lr=[9.647491770161852e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:24:59,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=12210/global_step=12210, RunningAvgSamplesPerSec=2.6310153223405432, CurrSamplesPerSec=2.6312611021787955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:25:15,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=12220, skipped=0, lr=[9.646870085165987e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:25:15,067] [INFO] [timer.py:259:stop] epoch=0/micro_step=12220/global_step=12220, RunningAvgSamplesPerSec=2.631032856486607, CurrSamplesPerSec=2.676116747187308, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:25:30,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=12230, skipped=0, lr=[9.646247872516526e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:25:30,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=12230/global_step=12230, RunningAvgSamplesPerSec=2.6310446962863003, CurrSamplesPerSec=2.628162266506292, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:25:45,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=12240, skipped=0, lr=[9.645625132284118e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:25:45,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=12240/global_step=12240, RunningAvgSamplesPerSec=2.631062838958688, CurrSamplesPerSec=2.6330446529554385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:26:00,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=12250, skipped=0, lr=[9.645001864539478e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:26:00,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=12250/global_step=12250, RunningAvgSamplesPerSec=2.631073624897509, CurrSamplesPerSec=2.643979431258477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:26:15,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=12260, skipped=0, lr=[9.644378069353379e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:26:15,770] [INFO] [timer.py:259:stop] epoch=0/micro_step=12260/global_step=12260, RunningAvgSamplesPerSec=2.6310855665555466, CurrSamplesPerSec=2.6648537172156304, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:26:30,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=12270, skipped=0, lr=[9.64375374679665e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:26:30,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=12270/global_step=12270, RunningAvgSamplesPerSec=2.631097693887422, CurrSamplesPerSec=2.6481966363654275, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:26:46,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=12280, skipped=0, lr=[9.643128896940184e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:26:46,118] [INFO] [timer.py:259:stop] epoch=0/micro_step=12280/global_step=12280, RunningAvgSamplesPerSec=2.631112666815814, CurrSamplesPerSec=2.650341479251009, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:27:01,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=12290, skipped=0, lr=[9.642503519854933e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:27:01,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=12290/global_step=12290, RunningAvgSamplesPerSec=2.6311210930513473, CurrSamplesPerSec=2.6514049381243674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:27:16,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=12300, skipped=0, lr=[9.64187761561191e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:27:16,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=12300/global_step=12300, RunningAvgSamplesPerSec=2.6311370295611503, CurrSamplesPerSec=2.6465607434067397, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:27:31,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=12310, skipped=0, lr=[9.641251184282185e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:27:31,749] [INFO] [timer.py:259:stop] epoch=0/micro_step=12310/global_step=12310, RunningAvgSamplesPerSec=2.631143878759771, CurrSamplesPerSec=2.6610281176749657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:27:46,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=12320, skipped=0, lr=[9.64062422593689e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:27:46,795] [INFO] [timer.py:259:stop] epoch=0/micro_step=12320/global_step=12320, RunningAvgSamplesPerSec=2.63117413443096, CurrSamplesPerSec=2.676201269062268, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:28:01,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=12330, skipped=0, lr=[9.639996740647217e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:28:01,962] [INFO] [timer.py:259:stop] epoch=0/micro_step=12330/global_step=12330, RunningAvgSamplesPerSec=2.6311886669967435, CurrSamplesPerSec=2.657989349412038, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:28:17,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=12340, skipped=0, lr=[9.639368728484417e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:28:17,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=12340/global_step=12340, RunningAvgSamplesPerSec=2.6311995467782294, CurrSamplesPerSec=2.64812014382084, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:28:32,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=12350, skipped=0, lr=[9.638740189519803e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:28:32,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=12350/global_step=12350, RunningAvgSamplesPerSec=2.6312178561589814, CurrSamplesPerSec=2.6553016537241594, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:28:47,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=12360, skipped=0, lr=[9.63811112382474e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:28:47,522] [INFO] [timer.py:259:stop] epoch=0/micro_step=12360/global_step=12360, RunningAvgSamplesPerSec=2.6312275793770077, CurrSamplesPerSec=2.658176331539697, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:29:02,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=12370, skipped=0, lr=[9.637481531470662e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:29:02,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=12370/global_step=12370, RunningAvgSamplesPerSec=2.6312459097112804, CurrSamplesPerSec=2.6767294382371825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:29:17,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=12380, skipped=0, lr=[9.63685141252906e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:29:17,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=12380/global_step=12380, RunningAvgSamplesPerSec=2.63126243282396, CurrSamplesPerSec=2.6010533525522197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:29:33,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=12390, skipped=0, lr=[9.636220767071486e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:29:33,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=12390/global_step=12390, RunningAvgSamplesPerSec=2.6312723578113086, CurrSamplesPerSec=2.653976426361012, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:29:48,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=12400, skipped=0, lr=[9.635589595169545e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:29:48,206] [INFO] [timer.py:259:stop] epoch=0/micro_step=12400/global_step=12400, RunningAvgSamplesPerSec=2.6312831089023097, CurrSamplesPerSec=2.6076256166865868, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:30:03,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=12410, skipped=0, lr=[9.634957896894911e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:30:03,373] [INFO] [timer.py:259:stop] epoch=0/micro_step=12410/global_step=12410, RunningAvgSamplesPerSec=2.6312980597430724, CurrSamplesPerSec=2.6385878549944706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:30:18,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=12420, skipped=0, lr=[9.634325672319312e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:30:18,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=12420/global_step=12420, RunningAvgSamplesPerSec=2.631303498734039, CurrSamplesPerSec=2.622890296737825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:30:33,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=12430, skipped=0, lr=[9.633692921514537e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:30:33,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=12430/global_step=12430, RunningAvgSamplesPerSec=2.631314218395364, CurrSamplesPerSec=2.6440515178003854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:30:48,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=12440, skipped=0, lr=[9.633059644552436e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:30:48,944] [INFO] [timer.py:259:stop] epoch=0/micro_step=12440/global_step=12440, RunningAvgSamplesPerSec=2.631333204692329, CurrSamplesPerSec=2.6468839184580064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:31:04,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=12450, skipped=0, lr=[9.632425841504918e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:31:04,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=12450/global_step=12450, RunningAvgSamplesPerSec=2.6313467973151647, CurrSamplesPerSec=2.6593666428490357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:31:19,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=12460, skipped=0, lr=[9.631791512443949e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:31:19,262] [INFO] [timer.py:259:stop] epoch=0/micro_step=12460/global_step=12460, RunningAvgSamplesPerSec=2.6313625972947974, CurrSamplesPerSec=2.627872870884678, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:31:34,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=12470, skipped=0, lr=[9.631156657441557e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:31:34,377] [INFO] [timer.py:259:stop] epoch=0/micro_step=12470/global_step=12470, RunningAvgSamplesPerSec=2.631383456611493, CurrSamplesPerSec=2.6774829844294556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:31:49,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=12480, skipped=0, lr=[9.630521276569836e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:31:49,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=12480/global_step=12480, RunningAvgSamplesPerSec=2.631393992191299, CurrSamplesPerSec=2.64473716141659, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:32:04,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=12490, skipped=0, lr=[9.62988536990093e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:32:04,722] [INFO] [timer.py:259:stop] epoch=0/micro_step=12490/global_step=12490, RunningAvgSamplesPerSec=2.6314083055445514, CurrSamplesPerSec=2.6632584871352987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:32:19,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=12500, skipped=0, lr=[9.629248937507042e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:32:19,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=12500/global_step=12500, RunningAvgSamplesPerSec=2.631423999805243, CurrSamplesPerSec=2.6286304566642826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:32:35,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=12510, skipped=0, lr=[9.628611979460448e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:32:35,037] [INFO] [timer.py:259:stop] epoch=0/micro_step=12510/global_step=12510, RunningAvgSamplesPerSec=2.631438466042101, CurrSamplesPerSec=2.6391959352773195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:32:50,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=12520, skipped=0, lr=[9.627974495833465e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:32:50,232] [INFO] [timer.py:259:stop] epoch=0/micro_step=12520/global_step=12520, RunningAvgSamplesPerSec=2.6314502913056543, CurrSamplesPerSec=2.65607766214871, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:33:05,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=12530, skipped=0, lr=[9.62733648669849e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:33:05,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=12530/global_step=12530, RunningAvgSamplesPerSec=2.631474137490304, CurrSamplesPerSec=2.671800860287234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:33:20,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=12540, skipped=0, lr=[9.62669795212796e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:33:20,542] [INFO] [timer.py:259:stop] epoch=0/micro_step=12540/global_step=12540, RunningAvgSamplesPerSec=2.631482124452866, CurrSamplesPerSec=2.659143668042859, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:33:35,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=12550, skipped=0, lr=[9.626058892194383e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:33:35,618] [INFO] [timer.py:259:stop] epoch=0/micro_step=12550/global_step=12550, RunningAvgSamplesPerSec=2.6315082837251578, CurrSamplesPerSec=2.6771860434200656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:33:50,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=12560, skipped=0, lr=[9.62541930697033e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:33:50,832] [INFO] [timer.py:259:stop] epoch=0/micro_step=12560/global_step=12560, RunningAvgSamplesPerSec=2.631515417897736, CurrSamplesPerSec=2.631633387553211, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:34:06,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=12570, skipped=0, lr=[9.624779196528419e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:34:06,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=12570/global_step=12570, RunningAvgSamplesPerSec=2.6315282801183946, CurrSamplesPerSec=2.6529797028415367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:34:21,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=12580, skipped=0, lr=[9.624138560941339e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:34:21,211] [INFO] [timer.py:259:stop] epoch=0/micro_step=12580/global_step=12580, RunningAvgSamplesPerSec=2.6315362840870864, CurrSamplesPerSec=2.6486736649806506, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:34:36,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=12590, skipped=0, lr=[9.623497400281832e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:34:36,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=12590/global_step=12590, RunningAvgSamplesPerSec=2.6315526908682423, CurrSamplesPerSec=2.6594728746213083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:34:51,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=12600, skipped=0, lr=[9.622855714622705e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:34:51,589] [INFO] [timer.py:259:stop] epoch=0/micro_step=12600/global_step=12600, RunningAvgSamplesPerSec=2.6315596656775315, CurrSamplesPerSec=2.655865328755043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:35:06,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=12610, skipped=0, lr=[9.622213504036818e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:35:06,794] [INFO] [timer.py:259:stop] epoch=0/micro_step=12610/global_step=12610, RunningAvgSamplesPerSec=2.6315687026452146, CurrSamplesPerSec=2.644723820277046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:35:21,987] [INFO] [logging.py:96:log_dist] [Rank 0] step=12620, skipped=0, lr=[9.621570768597097e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:35:21,999] [INFO] [timer.py:259:stop] epoch=0/micro_step=12620/global_step=12620, RunningAvgSamplesPerSec=2.631578243428691, CurrSamplesPerSec=2.6390713908984234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:35:37,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=12630, skipped=0, lr=[9.620927508376524e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:35:37,133] [INFO] [timer.py:259:stop] epoch=0/micro_step=12630/global_step=12630, RunningAvgSamplesPerSec=2.631596348743109, CurrSamplesPerSec=2.6555496244442423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:35:52,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=12640, skipped=0, lr=[9.62028372344814e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:35:52,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=12640/global_step=12640, RunningAvgSamplesPerSec=2.6316040758073784, CurrSamplesPerSec=2.649251680925965, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:36:07,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=12650, skipped=0, lr=[9.619639413885049e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:36:07,491] [INFO] [timer.py:259:stop] epoch=0/micro_step=12650/global_step=12650, RunningAvgSamplesPerSec=2.631619511834171, CurrSamplesPerSec=2.6580066146772747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:36:22,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=12660, skipped=0, lr=[9.618994579760412e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:36:22,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=12660/global_step=12660, RunningAvgSamplesPerSec=2.6316313282763857, CurrSamplesPerSec=2.63020920201444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:36:37,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=12670, skipped=0, lr=[9.61834922114745e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:36:37,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=12670/global_step=12670, RunningAvgSamplesPerSec=2.631644195759838, CurrSamplesPerSec=2.6192356699615034, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:36:52,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=12680, skipped=0, lr=[9.617703338119441e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:36:52,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=12680/global_step=12680, RunningAvgSamplesPerSec=2.6316659335502393, CurrSamplesPerSec=2.665669625825908, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:37:08,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=12690, skipped=0, lr=[9.617056930749731e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:37:08,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=12690/global_step=12690, RunningAvgSamplesPerSec=2.6316781162554563, CurrSamplesPerSec=2.6264811224644635, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:37:23,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=12700, skipped=0, lr=[9.616409999111716e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:37:23,268] [INFO] [timer.py:259:stop] epoch=0/micro_step=12700/global_step=12700, RunningAvgSamplesPerSec=2.631692816074502, CurrSamplesPerSec=2.679491949068426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:37:38,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=12710, skipped=0, lr=[9.615762543278853e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:37:38,462] [INFO] [timer.py:259:stop] epoch=0/micro_step=12710/global_step=12710, RunningAvgSamplesPerSec=2.6317038752750563, CurrSamplesPerSec=2.6185889318001427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:37:53,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=12720, skipped=0, lr=[9.615114563324666e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:37:53,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=12720/global_step=12720, RunningAvgSamplesPerSec=2.631725167924414, CurrSamplesPerSec=2.657354901236321, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:38:08,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=12730, skipped=0, lr=[9.614466059322731e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:38:08,793] [INFO] [timer.py:259:stop] epoch=0/micro_step=12730/global_step=12730, RunningAvgSamplesPerSec=2.6317333957067603, CurrSamplesPerSec=2.6021591415218692, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:38:23,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=12740, skipped=0, lr=[9.613817031346686e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:38:23,969] [INFO] [timer.py:259:stop] epoch=0/micro_step=12740/global_step=12740, RunningAvgSamplesPerSec=2.6317471721766004, CurrSamplesPerSec=2.633550135747771, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:38:39,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=12750, skipped=0, lr=[9.613167479470228e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:38:39,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=12750/global_step=12750, RunningAvgSamplesPerSec=2.6317629043529656, CurrSamplesPerSec=2.6340516771462528, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:38:54,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=12760, skipped=0, lr=[9.612517403767112e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:38:54,265] [INFO] [timer.py:259:stop] epoch=0/micro_step=12760/global_step=12760, RunningAvgSamplesPerSec=2.6317789534105454, CurrSamplesPerSec=2.662127205860387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:39:09,408] [INFO] [logging.py:96:log_dist] [Rank 0] step=12770, skipped=0, lr=[9.611866804311159e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:39:09,410] [INFO] [timer.py:259:stop] epoch=0/micro_step=12770/global_step=12770, RunningAvgSamplesPerSec=2.631795075593363, CurrSamplesPerSec=2.6443148962681295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:39:24,578] [INFO] [logging.py:96:log_dist] [Rank 0] step=12780, skipped=0, lr=[9.611215681176242e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:39:24,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=12780/global_step=12780, RunningAvgSamplesPerSec=2.631809337575705, CurrSamplesPerSec=2.674883676700432, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:39:39,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=12790, skipped=0, lr=[9.610564034436295e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:39:39,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=12790/global_step=12790, RunningAvgSamplesPerSec=2.6318247266630848, CurrSamplesPerSec=2.6563741447653664, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:39:54,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=12800, skipped=0, lr=[9.609911864165313e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:39:54,870] [INFO] [timer.py:259:stop] epoch=0/micro_step=12800/global_step=12800, RunningAvgSamplesPerSec=2.6318435429048046, CurrSamplesPerSec=2.6585221493295568, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:40:10,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=12810, skipped=0, lr=[9.609259170437353e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:40:10,076] [INFO] [timer.py:259:stop] epoch=0/micro_step=12810/global_step=12810, RunningAvgSamplesPerSec=2.631852158152157, CurrSamplesPerSec=2.6396763715418823, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:40:25,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=12820, skipped=0, lr=[9.608605953326526e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:40:25,198] [INFO] [timer.py:259:stop] epoch=0/micro_step=12820/global_step=12820, RunningAvgSamplesPerSec=2.631873174852759, CurrSamplesPerSec=2.6771275174957165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:40:40,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=12830, skipped=0, lr=[9.607952212907005e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:40:40,366] [INFO] [timer.py:259:stop] epoch=0/micro_step=12830/global_step=12830, RunningAvgSamplesPerSec=2.63188631271481, CurrSamplesPerSec=2.6495428756580495, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:40:55,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=12840, skipped=0, lr=[9.607297949253024e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:40:55,481] [INFO] [timer.py:259:stop] epoch=0/micro_step=12840/global_step=12840, RunningAvgSamplesPerSec=2.6319058470581225, CurrSamplesPerSec=2.672966783055909, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:41:10,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=12850, skipped=0, lr=[9.606643162438876e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:41:10,624] [INFO] [timer.py:259:stop] epoch=0/micro_step=12850/global_step=12850, RunningAvgSamplesPerSec=2.6319219483101532, CurrSamplesPerSec=2.6698497174530247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:41:25,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=12860, skipped=0, lr=[9.605987852538907e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:41:25,793] [INFO] [timer.py:259:stop] epoch=0/micro_step=12860/global_step=12860, RunningAvgSamplesPerSec=2.6319351898023235, CurrSamplesPerSec=2.6560881745848453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:41:40,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=12870, skipped=0, lr=[9.605332019627534e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:41:40,965] [INFO] [timer.py:259:stop] epoch=0/micro_step=12870/global_step=12870, RunningAvgSamplesPerSec=2.631948382225672, CurrSamplesPerSec=2.66834907886584, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:41:56,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=12880, skipped=0, lr=[9.604675663779222e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:41:56,138] [INFO] [timer.py:259:stop] epoch=0/micro_step=12880/global_step=12880, RunningAvgSamplesPerSec=2.631960999363477, CurrSamplesPerSec=2.6446166791311736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:42:11,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=12890, skipped=0, lr=[9.604018785068502e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:42:11,275] [INFO] [timer.py:259:stop] epoch=0/micro_step=12890/global_step=12890, RunningAvgSamplesPerSec=2.631977627068486, CurrSamplesPerSec=2.6444432706149645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:42:26,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=12900, skipped=0, lr=[9.603361383569963e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:42:26,436] [INFO] [timer.py:259:stop] epoch=0/micro_step=12900/global_step=12900, RunningAvgSamplesPerSec=2.6319934020457563, CurrSamplesPerSec=2.653688033989579, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:42:41,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=12910, skipped=0, lr=[9.602703459358255e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:42:41,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=12910/global_step=12910, RunningAvgSamplesPerSec=2.6320032861917904, CurrSamplesPerSec=2.6454127325366645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:42:56,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=12920, skipped=0, lr=[9.602045012508083e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:42:56,805] [INFO] [timer.py:259:stop] epoch=0/micro_step=12920/global_step=12920, RunningAvgSamplesPerSec=2.6320178052698826, CurrSamplesPerSec=2.6368577093735106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:43:11,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=12930, skipped=0, lr=[9.601386043094213e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:43:11,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=12930/global_step=12930, RunningAvgSamplesPerSec=2.6320356432972964, CurrSamplesPerSec=2.651376445236347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:43:27,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=12940, skipped=0, lr=[9.600726551191477e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:43:27,129] [INFO] [timer.py:259:stop] epoch=0/micro_step=12940/global_step=12940, RunningAvgSamplesPerSec=2.6320510278689904, CurrSamplesPerSec=2.6778252960003335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:43:42,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=12950, skipped=0, lr=[9.600066536874752e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:43:42,299] [INFO] [timer.py:259:stop] epoch=0/micro_step=12950/global_step=12950, RunningAvgSamplesPerSec=2.632064459853866, CurrSamplesPerSec=2.651756540478653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:43:57,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=12960, skipped=0, lr=[9.59940600021899e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:43:57,458] [INFO] [timer.py:259:stop] epoch=0/micro_step=12960/global_step=12960, RunningAvgSamplesPerSec=2.6320777434896168, CurrSamplesPerSec=2.6573165998620367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:44:12,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=12970, skipped=0, lr=[9.598744941299192e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:44:12,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=12970/global_step=12970, RunningAvgSamplesPerSec=2.632095377243998, CurrSamplesPerSec=2.641324138086431, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:44:27,743] [INFO] [logging.py:96:log_dist] [Rank 0] step=12980, skipped=0, lr=[9.598083360190421e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:44:27,744] [INFO] [timer.py:259:stop] epoch=0/micro_step=12980/global_step=12980, RunningAvgSamplesPerSec=2.632109551120628, CurrSamplesPerSec=2.6703953590967124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:44:42,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=12990, skipped=0, lr=[9.5974212569678e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:44:42,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=12990/global_step=12990, RunningAvgSamplesPerSec=2.632120199650574, CurrSamplesPerSec=2.6677274919883143, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:44:58,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=13000, skipped=0, lr=[9.596758631706512e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:44:58,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=13000/global_step=13000, RunningAvgSamplesPerSec=2.6321339010034617, CurrSamplesPerSec=2.6269762727397996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:45:13,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=13010, skipped=0, lr=[9.5960954844818e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:45:13,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=13010/global_step=13010, RunningAvgSamplesPerSec=2.632156884779982, CurrSamplesPerSec=2.6704778196745456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:45:28,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=13020, skipped=0, lr=[9.595431815368961e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:45:28,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=13020/global_step=13020, RunningAvgSamplesPerSec=2.63217243303756, CurrSamplesPerSec=2.624018428443626, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:45:43,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=13030, skipped=0, lr=[9.594767624443356e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:45:43,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=13030/global_step=13030, RunningAvgSamplesPerSec=2.632187527241494, CurrSamplesPerSec=2.652908806625278, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:45:58,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=13040, skipped=0, lr=[9.594102911780406e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:45:58,633] [INFO] [timer.py:259:stop] epoch=0/micro_step=13040/global_step=13040, RunningAvgSamplesPerSec=2.632203148077466, CurrSamplesPerSec=2.6248797431846027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:46:13,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=13050, skipped=0, lr=[9.593437677455586e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:46:13,791] [INFO] [timer.py:259:stop] epoch=0/micro_step=13050/global_step=13050, RunningAvgSamplesPerSec=2.6322164686653076, CurrSamplesPerSec=2.656049489230205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:46:28,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=13060, skipped=0, lr=[9.592771921544436e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:46:28,961] [INFO] [timer.py:259:stop] epoch=0/micro_step=13060/global_step=13060, RunningAvgSamplesPerSec=2.6322287338191477, CurrSamplesPerSec=2.6650141496625355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:46:43,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=13070, skipped=0, lr=[9.592105644122552e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:46:43,992] [INFO] [timer.py:259:stop] epoch=0/micro_step=13070/global_step=13070, RunningAvgSamplesPerSec=2.632258940331406, CurrSamplesPerSec=2.678040300444593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:46:59,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=13080, skipped=0, lr=[9.591438845265591e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:46:59,147] [INFO] [timer.py:259:stop] epoch=0/micro_step=13080/global_step=13080, RunningAvgSamplesPerSec=2.6322728229097487, CurrSamplesPerSec=2.640579583314878, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:47:14,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=13090, skipped=0, lr=[9.590771525049266e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:47:14,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=13090/global_step=13090, RunningAvgSamplesPerSec=2.632284857996792, CurrSamplesPerSec=2.6462209519315234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:47:29,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=13100, skipped=0, lr=[9.590103683549354e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:47:29,474] [INFO] [timer.py:259:stop] epoch=0/micro_step=13100/global_step=13100, RunningAvgSamplesPerSec=2.6322969786814703, CurrSamplesPerSec=2.6408248118300217, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:47:44,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=13110, skipped=0, lr=[9.589435320841687e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:47:44,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=13110/global_step=13110, RunningAvgSamplesPerSec=2.632307254638299, CurrSamplesPerSec=2.6435624065177388, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:47:59,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=13120, skipped=0, lr=[9.588766437002157e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:47:59,841] [INFO] [timer.py:259:stop] epoch=0/micro_step=13120/global_step=13120, RunningAvgSamplesPerSec=2.6323204705975374, CurrSamplesPerSec=2.659260841067416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:48:14,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=13130, skipped=0, lr=[9.588097032106719e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:48:14,978] [INFO] [timer.py:259:stop] epoch=0/micro_step=13130/global_step=13130, RunningAvgSamplesPerSec=2.6323362134923913, CurrSamplesPerSec=2.655405879842412, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:48:30,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=13140, skipped=0, lr=[9.587427106231382e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:48:30,128] [INFO] [timer.py:259:stop] epoch=0/micro_step=13140/global_step=13140, RunningAvgSamplesPerSec=2.6323509650158416, CurrSamplesPerSec=2.650429405282839, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:48:45,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=13150, skipped=0, lr=[9.586756659452216e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:48:45,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=13150/global_step=13150, RunningAvgSamplesPerSec=2.6323671936831032, CurrSamplesPerSec=2.6537870962084016, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:49:00,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=13160, skipped=0, lr=[9.586085691845353e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:49:00,441] [INFO] [timer.py:259:stop] epoch=0/micro_step=13160/global_step=13160, RunningAvgSamplesPerSec=2.6323774647221234, CurrSamplesPerSec=2.672949322894768, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:49:15,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=13170, skipped=0, lr=[9.585414203486978e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:49:15,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=13170/global_step=13170, RunningAvgSamplesPerSec=2.6323897102937983, CurrSamplesPerSec=2.640820655035051, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:49:30,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=13180, skipped=0, lr=[9.58474219445334e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:49:30,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=13180/global_step=13180, RunningAvgSamplesPerSec=2.632390344538889, CurrSamplesPerSec=2.6388837662294424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:49:46,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=13190, skipped=0, lr=[9.584069664820748e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:49:46,070] [INFO] [timer.py:259:stop] epoch=0/micro_step=13190/global_step=13190, RunningAvgSamplesPerSec=2.6324007200327486, CurrSamplesPerSec=2.636144664422882, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:50:01,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=13200, skipped=0, lr=[9.583396614665565e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:50:01,260] [INFO] [timer.py:259:stop] epoch=0/micro_step=13200/global_step=13200, RunningAvgSamplesPerSec=2.632410280384081, CurrSamplesPerSec=2.645255902558992, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:50:16,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=13210, skipped=0, lr=[9.582723044064218e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:50:16,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=13210/global_step=13210, RunningAvgSamplesPerSec=2.6324171940553813, CurrSamplesPerSec=2.645592108764004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:50:31,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=13220, skipped=0, lr=[9.582048953093189e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:50:31,659] [INFO] [timer.py:259:stop] epoch=0/micro_step=13220/global_step=13220, RunningAvgSamplesPerSec=2.6324288308931356, CurrSamplesPerSec=2.6544866187592016, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:50:46,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=13230, skipped=0, lr=[9.581374341829024e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:50:46,821] [INFO] [timer.py:259:stop] epoch=0/micro_step=13230/global_step=13230, RunningAvgSamplesPerSec=2.6324443283015233, CurrSamplesPerSec=2.6564397583731925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:51:02,088] [INFO] [logging.py:96:log_dist] [Rank 0] step=13240, skipped=0, lr=[9.580699210348324e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:51:02,090] [INFO] [timer.py:259:stop] epoch=0/micro_step=13240/global_step=13240, RunningAvgSamplesPerSec=2.6324491316858167, CurrSamplesPerSec=2.6505738678580055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:51:17,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=13250, skipped=0, lr=[9.58002355872775e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:51:17,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=13250/global_step=13250, RunningAvgSamplesPerSec=2.632458896188675, CurrSamplesPerSec=2.6403011586958343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:51:32,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=13260, skipped=0, lr=[9.579347387044023e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:51:32,428] [INFO] [timer.py:259:stop] epoch=0/micro_step=13260/global_step=13260, RunningAvgSamplesPerSec=2.632473341887672, CurrSamplesPerSec=2.6674301661550035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:51:47,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=13270, skipped=0, lr=[9.578670695373922e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:51:47,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=13270/global_step=13270, RunningAvgSamplesPerSec=2.632483992088736, CurrSamplesPerSec=2.6537694659505013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:52:02,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=13280, skipped=0, lr=[9.577993483794285e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:52:02,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=13280/global_step=13280, RunningAvgSamplesPerSec=2.6324962205022717, CurrSamplesPerSec=2.633169868957585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:52:17,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=13290, skipped=0, lr=[9.577315752382011e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:52:17,955] [INFO] [timer.py:259:stop] epoch=0/micro_step=13290/global_step=13290, RunningAvgSamplesPerSec=2.6325072112896986, CurrSamplesPerSec=2.602684327050695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:52:33,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=13300, skipped=0, lr=[9.576637501214057e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:52:33,123] [INFO] [timer.py:259:stop] epoch=0/micro_step=13300/global_step=13300, RunningAvgSamplesPerSec=2.632518965804921, CurrSamplesPerSec=2.6419517857720987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:52:48,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=13310, skipped=0, lr=[9.575958730367438e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:52:48,316] [INFO] [timer.py:259:stop] epoch=0/micro_step=13310/global_step=13310, RunningAvgSamplesPerSec=2.632527223209277, CurrSamplesPerSec=2.6374952602135595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:53:03,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=13320, skipped=0, lr=[9.575279439919228e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:53:03,514] [INFO] [timer.py:259:stop] epoch=0/micro_step=13320/global_step=13320, RunningAvgSamplesPerSec=2.632536032016779, CurrSamplesPerSec=2.656147886800798, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:53:18,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=13330, skipped=0, lr=[9.574599629946562e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:53:18,757] [INFO] [timer.py:259:stop] epoch=0/micro_step=13330/global_step=13330, RunningAvgSamplesPerSec=2.6325396982040443, CurrSamplesPerSec=2.5916841476957377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:53:33,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=13340, skipped=0, lr=[9.573919300526631e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:53:33,942] [INFO] [timer.py:259:stop] epoch=0/micro_step=13340/global_step=13340, RunningAvgSamplesPerSec=2.632550560644259, CurrSamplesPerSec=2.6529289424434395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:53:49,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=13350, skipped=0, lr=[9.573238451736686e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:53:49,204] [INFO] [timer.py:259:stop] epoch=0/micro_step=13350/global_step=13350, RunningAvgSamplesPerSec=2.632549062996842, CurrSamplesPerSec=2.6175708236568322, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:54:04,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=13360, skipped=0, lr=[9.572557083654042e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:54:04,395] [INFO] [timer.py:259:stop] epoch=0/micro_step=13360/global_step=13360, RunningAvgSamplesPerSec=2.6325598088185775, CurrSamplesPerSec=2.650177366265847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:54:19,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=13370, skipped=0, lr=[9.571875196356063e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:54:19,544] [INFO] [timer.py:259:stop] epoch=0/micro_step=13370/global_step=13370, RunningAvgSamplesPerSec=2.632574524433138, CurrSamplesPerSec=2.638154276091096, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:54:34,764] [INFO] [logging.py:96:log_dist] [Rank 0] step=13380, skipped=0, lr=[9.571192789920181e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:54:34,767] [INFO] [timer.py:259:stop] epoch=0/micro_step=13380/global_step=13380, RunningAvgSamplesPerSec=2.6325798166305616, CurrSamplesPerSec=2.637302055560367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:54:49,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=13390, skipped=0, lr=[9.570509864423883e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:54:49,983] [INFO] [timer.py:259:stop] epoch=0/micro_step=13390/global_step=13390, RunningAvgSamplesPerSec=2.6325856996898285, CurrSamplesPerSec=2.6529591467438105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:55:05,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=13400, skipped=0, lr=[9.569826419944714e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:55:05,224] [INFO] [timer.py:259:stop] epoch=0/micro_step=13400/global_step=13400, RunningAvgSamplesPerSec=2.6325909509943326, CurrSamplesPerSec=2.643041414644105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:55:20,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=13410, skipped=0, lr=[9.56914245656028e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:55:20,366] [INFO] [timer.py:259:stop] epoch=0/micro_step=13410/global_step=13410, RunningAvgSamplesPerSec=2.6326057717767184, CurrSamplesPerSec=2.6479287228310424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:55:35,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=13420, skipped=0, lr=[9.568457974348246e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:55:35,578] [INFO] [timer.py:259:stop] epoch=0/micro_step=13420/global_step=13420, RunningAvgSamplesPerSec=2.6326115413310682, CurrSamplesPerSec=2.6363782984933932, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:55:50,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=13430, skipped=0, lr=[9.567772973386334e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:55:50,788] [INFO] [timer.py:259:stop] epoch=0/micro_step=13430/global_step=13430, RunningAvgSamplesPerSec=2.632620749131765, CurrSamplesPerSec=2.677729559604204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:56:06,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=13440, skipped=0, lr=[9.567087453752329e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:56:06,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=13440/global_step=13440, RunningAvgSamplesPerSec=2.632626284228783, CurrSamplesPerSec=2.6394371696552166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:56:21,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=13450, skipped=0, lr=[9.566401415524067e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:56:21,232] [INFO] [timer.py:259:stop] epoch=0/micro_step=13450/global_step=13450, RunningAvgSamplesPerSec=2.6326314985852846, CurrSamplesPerSec=2.6369480586892893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:56:36,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=13460, skipped=0, lr=[9.565714858779452e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:56:36,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=13460/global_step=13460, RunningAvgSamplesPerSec=2.6326395576092647, CurrSamplesPerSec=2.6621998629523267, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:56:51,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=13470, skipped=0, lr=[9.56502778359644e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:56:51,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=13470/global_step=13470, RunningAvgSamplesPerSec=2.6326467473311523, CurrSamplesPerSec=2.639206314506176, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:57:06,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=13480, skipped=0, lr=[9.564340190053053e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:57:06,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=13480/global_step=13480, RunningAvgSamplesPerSec=2.632655936659426, CurrSamplesPerSec=2.6457911196916437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:57:22,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=13490, skipped=0, lr=[9.563652078227364e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:57:22,066] [INFO] [timer.py:259:stop] epoch=0/micro_step=13490/global_step=13490, RunningAvgSamplesPerSec=2.6326646873347976, CurrSamplesPerSec=2.6636068972593994, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:57:37,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=13500, skipped=0, lr=[9.562963448197506e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:57:37,262] [INFO] [timer.py:259:stop] epoch=0/micro_step=13500/global_step=13500, RunningAvgSamplesPerSec=2.6326744999195375, CurrSamplesPerSec=2.6461846403132134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:57:52,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=13510, skipped=0, lr=[9.562274300041678e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:57:52,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=13510/global_step=13510, RunningAvgSamplesPerSec=2.6326847687588284, CurrSamplesPerSec=2.6508850392938745, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:58:07,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=13520, skipped=0, lr=[9.56158463383813e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:58:07,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=13520/global_step=13520, RunningAvgSamplesPerSec=2.632695905517494, CurrSamplesPerSec=2.6578217616638575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:58:22,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=13530, skipped=0, lr=[9.560894449665174e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:58:22,784] [INFO] [timer.py:259:stop] epoch=0/micro_step=13530/global_step=13530, RunningAvgSamplesPerSec=2.6327096256431366, CurrSamplesPerSec=2.6249429887546927, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:58:37,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=13540, skipped=0, lr=[9.560203747601185e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:58:37,973] [INFO] [timer.py:259:stop] epoch=0/micro_step=13540/global_step=13540, RunningAvgSamplesPerSec=2.6327191934886685, CurrSamplesPerSec=2.6521392602115195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:58:53,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=13550, skipped=0, lr=[9.559512527724584e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:58:53,160] [INFO] [timer.py:259:stop] epoch=0/micro_step=13550/global_step=13550, RunningAvgSamplesPerSec=2.632729352284155, CurrSamplesPerSec=2.6360551985106064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:59:08,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=13560, skipped=0, lr=[9.558820790113867e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:59:08,355] [INFO] [timer.py:259:stop] epoch=0/micro_step=13560/global_step=13560, RunningAvgSamplesPerSec=2.6327378373411965, CurrSamplesPerSec=2.641100852301522, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:59:23,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=13570, skipped=0, lr=[9.558128534847578e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:59:23,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=13570/global_step=13570, RunningAvgSamplesPerSec=2.6327495345367127, CurrSamplesPerSec=2.650855719890474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:59:38,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=13580, skipped=0, lr=[9.55743576200432e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:59:38,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=13580/global_step=13580, RunningAvgSamplesPerSec=2.6327539585641544, CurrSamplesPerSec=2.591882738530918, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 05:59:53,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=13590, skipped=0, lr=[9.556742471662766e-06], mom=[(0.9, 0.95)] +[2024-11-01 05:59:53,984] [INFO] [timer.py:259:stop] epoch=0/micro_step=13590/global_step=13590, RunningAvgSamplesPerSec=2.6327603621580153, CurrSamplesPerSec=2.6426925357242914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:00:09,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=13600, skipped=0, lr=[9.55604866390163e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:00:09,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=13600/global_step=13600, RunningAvgSamplesPerSec=2.63276645833827, CurrSamplesPerSec=2.6433004272988523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:00:24,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=13610, skipped=0, lr=[9.555354338799698e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:00:24,409] [INFO] [timer.py:259:stop] epoch=0/micro_step=13610/global_step=13610, RunningAvgSamplesPerSec=2.632774766690198, CurrSamplesPerSec=2.637787608621198, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:00:39,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=13620, skipped=0, lr=[9.554659496435812e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:00:39,596] [INFO] [timer.py:259:stop] epoch=0/micro_step=13620/global_step=13620, RunningAvgSamplesPerSec=2.6327825010805617, CurrSamplesPerSec=2.64457124056639, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:00:54,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=13630, skipped=0, lr=[9.55396413688887e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:00:54,816] [INFO] [timer.py:259:stop] epoch=0/micro_step=13630/global_step=13630, RunningAvgSamplesPerSec=2.63278782485137, CurrSamplesPerSec=2.6129881289108394, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:01:10,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=13640, skipped=0, lr=[9.553268260237828e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:01:10,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=13640/global_step=13640, RunningAvgSamplesPerSec=2.6327939892626917, CurrSamplesPerSec=2.6240553655175285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:01:25,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=13650, skipped=0, lr=[9.552571866561708e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:01:25,226] [INFO] [timer.py:259:stop] epoch=0/micro_step=13650/global_step=13650, RunningAvgSamplesPerSec=2.6328031399453034, CurrSamplesPerSec=2.6580567273271476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:01:40,451] [INFO] [logging.py:96:log_dist] [Rank 0] step=13660, skipped=0, lr=[9.551874955939585e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:01:40,452] [INFO] [timer.py:259:stop] epoch=0/micro_step=13660/global_step=13660, RunningAvgSamplesPerSec=2.6328076158164095, CurrSamplesPerSec=2.6343651858332278, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:01:55,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=13670, skipped=0, lr=[9.55117752845059e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:01:55,591] [INFO] [timer.py:259:stop] epoch=0/micro_step=13670/global_step=13670, RunningAvgSamplesPerSec=2.632822756358454, CurrSamplesPerSec=2.632312605038331, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:02:10,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=13680, skipped=0, lr=[9.550479584173918e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:02:10,784] [INFO] [timer.py:259:stop] epoch=0/micro_step=13680/global_step=13680, RunningAvgSamplesPerSec=2.6328314480469537, CurrSamplesPerSec=2.648862266191287, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:02:25,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=13690, skipped=0, lr=[9.549781123188819e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:02:25,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=13690/global_step=13690, RunningAvgSamplesPerSec=2.6328491776300904, CurrSamplesPerSec=2.6743515462824563, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:02:41,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=13700, skipped=0, lr=[9.549082145574608e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:02:41,050] [INFO] [timer.py:259:stop] epoch=0/micro_step=13700/global_step=13700, RunningAvgSamplesPerSec=2.6328627250809857, CurrSamplesPerSec=2.6571861309396274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:02:56,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=13710, skipped=0, lr=[9.54838265141065e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:02:56,196] [INFO] [timer.py:259:stop] epoch=0/micro_step=13710/global_step=13710, RunningAvgSamplesPerSec=2.632879013701662, CurrSamplesPerSec=2.648118471902002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:03:11,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=13720, skipped=0, lr=[9.547682640776375e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:03:11,403] [INFO] [timer.py:259:stop] epoch=0/micro_step=13720/global_step=13720, RunningAvgSamplesPerSec=2.6328875036851325, CurrSamplesPerSec=2.6433712273726027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:03:26,578] [INFO] [logging.py:96:log_dist] [Rank 0] step=13730, skipped=0, lr=[9.546982113751267e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:03:26,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=13730/global_step=13730, RunningAvgSamplesPerSec=2.632897960010228, CurrSamplesPerSec=2.6389194626732873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:03:41,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=13740, skipped=0, lr=[9.546281070414874e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:03:41,797] [INFO] [timer.py:259:stop] epoch=0/micro_step=13740/global_step=13740, RunningAvgSamplesPerSec=2.6329024639467273, CurrSamplesPerSec=2.632649246944525, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:03:56,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=13750, skipped=0, lr=[9.545579510846797e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:03:57,000] [INFO] [timer.py:259:stop] epoch=0/micro_step=13750/global_step=13750, RunningAvgSamplesPerSec=2.632909421321481, CurrSamplesPerSec=2.6638868753210136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:04:12,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=13760, skipped=0, lr=[9.544877435126701e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:04:12,232] [INFO] [timer.py:259:stop] epoch=0/micro_step=13760/global_step=13760, RunningAvgSamplesPerSec=2.632913809838662, CurrSamplesPerSec=2.6347735207567555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:04:27,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=13770, skipped=0, lr=[9.544174843334305e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:04:27,468] [INFO] [timer.py:259:stop] epoch=0/micro_step=13770/global_step=13770, RunningAvgSamplesPerSec=2.6329185728082014, CurrSamplesPerSec=2.640508101541824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:04:42,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=13780, skipped=0, lr=[9.54347173554939e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:04:42,690] [INFO] [timer.py:259:stop] epoch=0/micro_step=13780/global_step=13780, RunningAvgSamplesPerSec=2.632923925433871, CurrSamplesPerSec=2.6403726292647858, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:04:57,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=13790, skipped=0, lr=[9.542768111851792e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:04:57,916] [INFO] [timer.py:259:stop] epoch=0/micro_step=13790/global_step=13790, RunningAvgSamplesPerSec=2.632931579590346, CurrSamplesPerSec=2.6240044747085527, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:05:13,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=13800, skipped=0, lr=[9.542063972321407e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:05:13,124] [INFO] [timer.py:259:stop] epoch=0/micro_step=13800/global_step=13800, RunningAvgSamplesPerSec=2.632937489216436, CurrSamplesPerSec=2.6382787339211093, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:05:28,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=13810, skipped=0, lr=[9.541359317038194e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:05:28,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=13810/global_step=13810, RunningAvgSamplesPerSec=2.632945241972178, CurrSamplesPerSec=2.621602951341671, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:05:43,501] [INFO] [logging.py:96:log_dist] [Rank 0] step=13820, skipped=0, lr=[9.540654146082166e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:05:43,502] [INFO] [timer.py:259:stop] epoch=0/micro_step=13820/global_step=13820, RunningAvgSamplesPerSec=2.6329569428332036, CurrSamplesPerSec=2.63492083371932, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:05:58,750] [INFO] [logging.py:96:log_dist] [Rank 0] step=13830, skipped=0, lr=[9.539948459533395e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:05:58,751] [INFO] [timer.py:259:stop] epoch=0/micro_step=13830/global_step=13830, RunningAvgSamplesPerSec=2.6329581140112865, CurrSamplesPerSec=2.5979125753667094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:06:13,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=13840, skipped=0, lr=[9.539242257472009e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:06:13,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=13840/global_step=13840, RunningAvgSamplesPerSec=2.6329637061531725, CurrSamplesPerSec=2.646812930093872, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:06:29,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=13850, skipped=0, lr=[9.5385355399782e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:06:29,150] [INFO] [timer.py:259:stop] epoch=0/micro_step=13850/global_step=13850, RunningAvgSamplesPerSec=2.632973111045552, CurrSamplesPerSec=2.647936663326259, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:06:44,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=13860, skipped=0, lr=[9.537828307132217e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:06:44,356] [INFO] [timer.py:259:stop] epoch=0/micro_step=13860/global_step=13860, RunningAvgSamplesPerSec=2.6329796169969097, CurrSamplesPerSec=2.6393130177122774, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:06:59,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=13870, skipped=0, lr=[9.537120559014366e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:06:59,533] [INFO] [timer.py:259:stop] epoch=0/micro_step=13870/global_step=13870, RunningAvgSamplesPerSec=2.632990307145942, CurrSamplesPerSec=2.672738541388237, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:07:14,709] [INFO] [logging.py:96:log_dist] [Rank 0] step=13880, skipped=0, lr=[9.53641229570501e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:07:14,712] [INFO] [timer.py:259:stop] epoch=0/micro_step=13880/global_step=13880, RunningAvgSamplesPerSec=2.6330021216532598, CurrSamplesPerSec=2.6390801086223337, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:07:29,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=13890, skipped=0, lr=[9.535703517284573e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:07:29,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=13890/global_step=13890, RunningAvgSamplesPerSec=2.633005089011261, CurrSamplesPerSec=2.6458107303552443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:07:45,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=13900, skipped=0, lr=[9.53499422383354e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:07:45,221] [INFO] [timer.py:259:stop] epoch=0/micro_step=13900/global_step=13900, RunningAvgSamplesPerSec=2.63300749051744, CurrSamplesPerSec=2.6478280080509267, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +Beginning of Epoch 2/3Beginning of Epoch 2/3 + +[2024-11-01 06:08:00,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=13910, skipped=0, lr=[9.534284415432446e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:08:00,408] [INFO] [timer.py:259:stop] epoch=0/micro_step=13910/global_step=13910, RunningAvgSamplesPerSec=2.6330184399949235, CurrSamplesPerSec=2.64206453607163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:08:15,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=13920, skipped=0, lr=[9.533574092161896e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:08:15,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=13920/global_step=13920, RunningAvgSamplesPerSec=2.633024880505701, CurrSamplesPerSec=2.638711108692344, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:08:30,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=13930, skipped=0, lr=[9.532863254102546e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:08:30,883] [INFO] [timer.py:259:stop] epoch=0/micro_step=13930/global_step=13930, RunningAvgSamplesPerSec=2.633028225096377, CurrSamplesPerSec=2.642890694311213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:08:46,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=13940, skipped=0, lr=[9.532151901335109e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:08:46,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=13940/global_step=13940, RunningAvgSamplesPerSec=2.633033011588812, CurrSamplesPerSec=2.6484512253480434, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:09:01,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=13950, skipped=0, lr=[9.53144003394036e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:09:01,370] [INFO] [timer.py:259:stop] epoch=0/micro_step=13950/global_step=13950, RunningAvgSamplesPerSec=2.633036655706035, CurrSamplesPerSec=2.641122056591641, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:09:16,550] [INFO] [logging.py:96:log_dist] [Rank 0] step=13960, skipped=0, lr=[9.530727651999134e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:09:16,560] [INFO] [timer.py:259:stop] epoch=0/micro_step=13960/global_step=13960, RunningAvgSamplesPerSec=2.6330461732641264, CurrSamplesPerSec=2.6505303180613473, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:09:31,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=13970, skipped=0, lr=[9.53001475559232e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:09:31,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=13970/global_step=13970, RunningAvgSamplesPerSec=2.6330524090249607, CurrSamplesPerSec=2.658965399399524, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:09:47,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=13980, skipped=0, lr=[9.529301344800872e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:09:47,009] [INFO] [timer.py:259:stop] epoch=0/micro_step=13980/global_step=13980, RunningAvgSamplesPerSec=2.633058793791782, CurrSamplesPerSec=2.644932290962317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:10:02,210] [INFO] [logging.py:96:log_dist] [Rank 0] step=13990, skipped=0, lr=[9.528587419705791e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:10:02,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=13990/global_step=13990, RunningAvgSamplesPerSec=2.633065187392898, CurrSamplesPerSec=2.644838892031711, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:10:17,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=14000, skipped=0, lr=[9.527872980388148e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:10:17,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=14000/global_step=14000, RunningAvgSamplesPerSec=2.633080804881788, CurrSamplesPerSec=2.6611209751492133, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:10:32,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=14010, skipped=0, lr=[9.527158026929067e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:10:32,549] [INFO] [timer.py:259:stop] epoch=0/micro_step=14010/global_step=14010, RunningAvgSamplesPerSec=2.6330920544354255, CurrSamplesPerSec=2.6710322229634107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:10:47,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=14020, skipped=0, lr=[9.526442559409731e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:10:47,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=14020/global_step=14020, RunningAvgSamplesPerSec=2.6331011662904475, CurrSamplesPerSec=2.6597359612550497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:11:03,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=14030, skipped=0, lr=[9.525726577911381e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:11:03,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=14030/global_step=14030, RunningAvgSamplesPerSec=2.6331020549358306, CurrSamplesPerSec=2.6392677612132385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:11:18,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=14040, skipped=0, lr=[9.525010082515319e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:11:18,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=14040/global_step=14040, RunningAvgSamplesPerSec=2.6331101232426213, CurrSamplesPerSec=2.6310089820091656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:11:33,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=14050, skipped=0, lr=[9.524293073302901e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:11:33,361] [INFO] [timer.py:259:stop] epoch=0/micro_step=14050/global_step=14050, RunningAvgSamplesPerSec=2.6331260149667672, CurrSamplesPerSec=2.632152781910216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:11:48,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=14060, skipped=0, lr=[9.523575550355544e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:11:48,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=14060/global_step=14060, RunningAvgSamplesPerSec=2.633133435121514, CurrSamplesPerSec=2.581883847394518, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:12:03,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=14070, skipped=0, lr=[9.522857513754724e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:12:03,804] [INFO] [timer.py:259:stop] epoch=0/micro_step=14070/global_step=14070, RunningAvgSamplesPerSec=2.6331371747441046, CurrSamplesPerSec=2.6414655303571264, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:12:19,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=14080, skipped=0, lr=[9.522138963581974e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:12:19,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=14080/global_step=14080, RunningAvgSamplesPerSec=2.6331433407470497, CurrSamplesPerSec=2.621475556095343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:12:34,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=14090, skipped=0, lr=[9.521419899918886e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:12:34,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=14090/global_step=14090, RunningAvgSamplesPerSec=2.633155682676636, CurrSamplesPerSec=2.660793470394424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:12:49,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=14100, skipped=0, lr=[9.520700322847108e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:12:49,419] [INFO] [timer.py:259:stop] epoch=0/micro_step=14100/global_step=14100, RunningAvgSamplesPerSec=2.63316077685718, CurrSamplesPerSec=2.643925264767307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:13:04,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=14110, skipped=0, lr=[9.519980232448349e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:13:04,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=14110/global_step=14110, RunningAvgSamplesPerSec=2.633169713884452, CurrSamplesPerSec=2.653202064665411, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:13:19,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=14120, skipped=0, lr=[9.519259628804378e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:13:19,870] [INFO] [timer.py:259:stop] epoch=0/micro_step=14120/global_step=14120, RunningAvgSamplesPerSec=2.633177171441426, CurrSamplesPerSec=2.6349920133000464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:13:35,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=14130, skipped=0, lr=[9.518538511997015e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:13:35,083] [INFO] [timer.py:259:stop] epoch=0/micro_step=14130/global_step=14130, RunningAvgSamplesPerSec=2.633183107508215, CurrSamplesPerSec=2.6440048486078505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:13:50,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=14140, skipped=0, lr=[9.517816882108149e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:13:50,298] [INFO] [timer.py:259:stop] epoch=0/micro_step=14140/global_step=14140, RunningAvgSamplesPerSec=2.6331900914390096, CurrSamplesPerSec=2.625924095971456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:14:05,506] [INFO] [logging.py:96:log_dist] [Rank 0] step=14150, skipped=0, lr=[9.517094739219715e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:14:05,508] [INFO] [timer.py:259:stop] epoch=0/micro_step=14150/global_step=14150, RunningAvgSamplesPerSec=2.6331982303960166, CurrSamplesPerSec=2.630121375470894, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:14:20,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=14160, skipped=0, lr=[9.516372083413717e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:14:20,717] [INFO] [timer.py:259:stop] epoch=0/micro_step=14160/global_step=14160, RunningAvgSamplesPerSec=2.633208537945018, CurrSamplesPerSec=2.6701072121232623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:14:35,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=14170, skipped=0, lr=[9.51564891477221e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:14:35,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=14170/global_step=14170, RunningAvgSamplesPerSec=2.633219931573188, CurrSamplesPerSec=2.654858783985272, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:14:51,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=14180, skipped=0, lr=[9.514925233377314e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:14:51,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=14180/global_step=14180, RunningAvgSamplesPerSec=2.633225333019738, CurrSamplesPerSec=2.653613322462835, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:15:06,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=14190, skipped=0, lr=[9.5142010393112e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:15:06,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=14190/global_step=14190, RunningAvgSamplesPerSec=2.6332354746242363, CurrSamplesPerSec=2.6715226195000126, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:15:21,408] [INFO] [logging.py:96:log_dist] [Rank 0] step=14200, skipped=0, lr=[9.5134763326561e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:15:21,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=14200/global_step=14200, RunningAvgSamplesPerSec=2.6332486360265164, CurrSamplesPerSec=2.6467210684512614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:15:36,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=14210, skipped=0, lr=[9.512751113494308e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:15:36,618] [INFO] [timer.py:259:stop] epoch=0/micro_step=14210/global_step=14210, RunningAvgSamplesPerSec=2.633255642137645, CurrSamplesPerSec=2.6429922829233616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:15:51,810] [INFO] [logging.py:96:log_dist] [Rank 0] step=14220, skipped=0, lr=[9.512025381908169e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:15:51,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=14220/global_step=14220, RunningAvgSamplesPerSec=2.6332601991086353, CurrSamplesPerSec=2.6502033215583847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:16:07,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=14230, skipped=0, lr=[9.511299137980092e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:16:07,034] [INFO] [timer.py:259:stop] epoch=0/micro_step=14230/global_step=14230, RunningAvgSamplesPerSec=2.6332677519537424, CurrSamplesPerSec=2.6397195653922734, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:16:22,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=14240, skipped=0, lr=[9.510572381792542e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:16:22,272] [INFO] [timer.py:259:stop] epoch=0/micro_step=14240/global_step=14240, RunningAvgSamplesPerSec=2.633271024272699, CurrSamplesPerSec=2.635722239866751, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:16:37,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=14250, skipped=0, lr=[9.509845113428042e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:16:37,429] [INFO] [timer.py:259:stop] epoch=0/micro_step=14250/global_step=14250, RunningAvgSamplesPerSec=2.633283390322776, CurrSamplesPerSec=2.672966783055909, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:16:52,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=14260, skipped=0, lr=[9.509117332969175e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:16:52,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=14260/global_step=14260, RunningAvgSamplesPerSec=2.633293795052593, CurrSamplesPerSec=2.6584218908189667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:17:07,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=14270, skipped=0, lr=[9.508389040498578e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:17:07,791] [INFO] [timer.py:259:stop] epoch=0/micro_step=14270/global_step=14270, RunningAvgSamplesPerSec=2.633302785863613, CurrSamplesPerSec=2.645315545811384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:17:22,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=14280, skipped=0, lr=[9.507660236098953e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:17:22,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=14280/global_step=14280, RunningAvgSamplesPerSec=2.6333144675326006, CurrSamplesPerSec=2.6592840240044935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:17:38,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=14290, skipped=0, lr=[9.506930919853052e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:17:38,112] [INFO] [timer.py:259:stop] epoch=0/micro_step=14290/global_step=14290, RunningAvgSamplesPerSec=2.633328395006791, CurrSamplesPerSec=2.6077066780824056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:17:53,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=14300, skipped=0, lr=[9.506201091843691e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:17:53,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=14300/global_step=14300, RunningAvgSamplesPerSec=2.6333369150067663, CurrSamplesPerSec=2.66213269724623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:18:08,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=14310, skipped=0, lr=[9.505470752153741e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:18:08,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=14310/global_step=14310, RunningAvgSamplesPerSec=2.6333441031755664, CurrSamplesPerSec=2.602552708005869, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:18:23,704] [INFO] [logging.py:96:log_dist] [Rank 0] step=14320, skipped=0, lr=[9.504739900866134e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:18:23,706] [INFO] [timer.py:259:stop] epoch=0/micro_step=14320/global_step=14320, RunningAvgSamplesPerSec=2.633355982490281, CurrSamplesPerSec=2.644163613930977, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:18:38,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=14330, skipped=0, lr=[9.504008538063857e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:18:38,872] [INFO] [timer.py:259:stop] epoch=0/micro_step=14330/global_step=14330, RunningAvgSamplesPerSec=2.633367833646501, CurrSamplesPerSec=2.6301659065064213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:18:54,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=14340, skipped=0, lr=[9.503276663829958e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:18:54,085] [INFO] [timer.py:259:stop] epoch=0/micro_step=14340/global_step=14340, RunningAvgSamplesPerSec=2.633374156281452, CurrSamplesPerSec=2.6452137785258105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:19:09,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=14350, skipped=0, lr=[9.502544278247541e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:19:09,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=14350/global_step=14350, RunningAvgSamplesPerSec=2.6333796454416705, CurrSamplesPerSec=2.5895656057312437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:19:24,524] [INFO] [logging.py:96:log_dist] [Rank 0] step=14360, skipped=0, lr=[9.501811381399766e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:19:24,526] [INFO] [timer.py:259:stop] epoch=0/micro_step=14360/global_step=14360, RunningAvgSamplesPerSec=2.633387180242874, CurrSamplesPerSec=2.6109211720604946, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:19:39,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=14370, skipped=0, lr=[9.501077973369856e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:19:39,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=14370/global_step=14370, RunningAvgSamplesPerSec=2.63340405600163, CurrSamplesPerSec=2.6663677998566904, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:19:54,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=14380, skipped=0, lr=[9.500344054241091e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:19:54,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=14380/global_step=14380, RunningAvgSamplesPerSec=2.6334180276863797, CurrSamplesPerSec=2.651959833482645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:20:09,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=14390, skipped=0, lr=[9.499609624096806e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:20:09,983] [INFO] [timer.py:259:stop] epoch=0/micro_step=14390/global_step=14390, RunningAvgSamplesPerSec=2.633426352514616, CurrSamplesPerSec=2.63824471428139, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:20:25,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=14400, skipped=0, lr=[9.498874683020396e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:20:25,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=14400/global_step=14400, RunningAvgSamplesPerSec=2.6334392487095566, CurrSamplesPerSec=2.6603335805020913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:20:40,278] [INFO] [logging.py:96:log_dist] [Rank 0] step=14410, skipped=0, lr=[9.498139231095314e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:20:40,299] [INFO] [timer.py:259:stop] epoch=0/micro_step=14410/global_step=14410, RunningAvgSamplesPerSec=2.633451201051934, CurrSamplesPerSec=2.655890134271343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:20:55,489] [INFO] [logging.py:96:log_dist] [Rank 0] step=14420, skipped=0, lr=[9.49740326840507e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:20:55,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=14420/global_step=14420, RunningAvgSamplesPerSec=2.6334581840822313, CurrSamplesPerSec=2.648159434521288, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:21:10,691] [INFO] [logging.py:96:log_dist] [Rank 0] step=14430, skipped=0, lr=[9.496666795033234e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:21:10,693] [INFO] [timer.py:259:stop] epoch=0/micro_step=14430/global_step=14430, RunningAvgSamplesPerSec=2.6334650588839863, CurrSamplesPerSec=2.6468784898015825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:21:25,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=14440, skipped=0, lr=[9.495929811063432e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:21:25,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=14440/global_step=14440, RunningAvgSamplesPerSec=2.6334802326815825, CurrSamplesPerSec=2.6550688555750193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:21:41,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=14450, skipped=0, lr=[9.495192316579348e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:21:41,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=14450/global_step=14450, RunningAvgSamplesPerSec=2.633490120895356, CurrSamplesPerSec=2.6653282976188457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:21:56,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=14460, skipped=0, lr=[9.494454311664729e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:21:56,145] [INFO] [timer.py:259:stop] epoch=0/micro_step=14460/global_step=14460, RunningAvgSamplesPerSec=2.6335059950662836, CurrSamplesPerSec=2.656869271067129, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:22:11,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=14470, skipped=0, lr=[9.493715796403372e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:22:11,354] [INFO] [timer.py:259:stop] epoch=0/micro_step=14470/global_step=14470, RunningAvgSamplesPerSec=2.6335120180726523, CurrSamplesPerSec=2.6392523992682335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:22:26,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=14480, skipped=0, lr=[9.492976770879134e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:22:26,515] [INFO] [timer.py:259:stop] epoch=0/micro_step=14480/global_step=14480, RunningAvgSamplesPerSec=2.6335225701859373, CurrSamplesPerSec=2.655645462824997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:22:41,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=14490, skipped=0, lr=[9.492237235175935e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:22:41,703] [INFO] [timer.py:259:stop] epoch=0/micro_step=14490/global_step=14490, RunningAvgSamplesPerSec=2.633532110984283, CurrSamplesPerSec=2.656745577365412, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:22:56,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=14500, skipped=0, lr=[9.491497189377745e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:22:56,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=14500/global_step=14500, RunningAvgSamplesPerSec=2.6335466138702905, CurrSamplesPerSec=2.6627385784582414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:23:11,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=14510, skipped=0, lr=[9.490756633568603e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:23:11,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=14510/global_step=14510, RunningAvgSamplesPerSec=2.633557228786594, CurrSamplesPerSec=2.6437598623735394, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:23:27,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=14520, skipped=0, lr=[9.490015567832595e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:23:27,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=14520/global_step=14520, RunningAvgSamplesPerSec=2.6335592550850886, CurrSamplesPerSec=2.629029600009188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:23:42,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=14530, skipped=0, lr=[9.489273992253871e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:23:42,462] [INFO] [timer.py:259:stop] epoch=0/micro_step=14530/global_step=14530, RunningAvgSamplesPerSec=2.633564132767288, CurrSamplesPerSec=2.6580306179796693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:23:57,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=14540, skipped=0, lr=[9.488531906916636e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:23:57,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=14540/global_step=14540, RunningAvgSamplesPerSec=2.6335763065357165, CurrSamplesPerSec=2.665327027329744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:24:12,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=14550, skipped=0, lr=[9.487789311905155e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:24:12,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=14550/global_step=14550, RunningAvgSamplesPerSec=2.6335848952380605, CurrSamplesPerSec=2.6547121737138935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:24:27,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=14560, skipped=0, lr=[9.48704620730375e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:24:27,992] [INFO] [timer.py:259:stop] epoch=0/micro_step=14560/global_step=14560, RunningAvgSamplesPerSec=2.633596070715819, CurrSamplesPerSec=2.6272930365032683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:24:43,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=14570, skipped=0, lr=[9.486302593196798e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:24:43,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=14570/global_step=14570, RunningAvgSamplesPerSec=2.6336005024478046, CurrSamplesPerSec=2.644557901100735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:24:58,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=14580, skipped=0, lr=[9.48555846966874e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:24:58,475] [INFO] [timer.py:259:stop] epoch=0/micro_step=14580/global_step=14580, RunningAvgSamplesPerSec=2.6336020757337124, CurrSamplesPerSec=2.585739744111137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:25:13,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=14590, skipped=0, lr=[9.48481383680407e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:25:13,655] [INFO] [timer.py:259:stop] epoch=0/micro_step=14590/global_step=14590, RunningAvgSamplesPerSec=2.6336126066518517, CurrSamplesPerSec=2.645088248522024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:25:28,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=14600, skipped=0, lr=[9.48406869468734e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:25:28,919] [INFO] [timer.py:259:stop] epoch=0/micro_step=14600/global_step=14600, RunningAvgSamplesPerSec=2.6336158101885605, CurrSamplesPerSec=2.6398504015451683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:25:44,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=14610, skipped=0, lr=[9.483323043403165e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:25:44,102] [INFO] [timer.py:259:stop] epoch=0/micro_step=14610/global_step=14610, RunningAvgSamplesPerSec=2.633624682633783, CurrSamplesPerSec=2.6530640281269955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:25:59,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=14620, skipped=0, lr=[9.482576883036212e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:25:59,315] [INFO] [timer.py:259:stop] epoch=0/micro_step=14620/global_step=14620, RunningAvgSamplesPerSec=2.6336310109004613, CurrSamplesPerSec=2.6267558173370293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:26:14,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=14630, skipped=0, lr=[9.481830213671206e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:26:14,516] [INFO] [timer.py:259:stop] epoch=0/micro_step=14630/global_step=14630, RunningAvgSamplesPerSec=2.6336392460787126, CurrSamplesPerSec=2.6437215353013257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:26:29,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=14640, skipped=0, lr=[9.481083035392933e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:26:29,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=14640/global_step=14640, RunningAvgSamplesPerSec=2.6336433528847314, CurrSamplesPerSec=2.6419671791571844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:26:44,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=14650, skipped=0, lr=[9.480335348286233e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:26:44,906] [INFO] [timer.py:259:stop] epoch=0/micro_step=14650/global_step=14650, RunningAvgSamplesPerSec=2.6336555987209236, CurrSamplesPerSec=2.651392786670759, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:27:00,073] [INFO] [logging.py:96:log_dist] [Rank 0] step=14660, skipped=0, lr=[9.479587152436011e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:27:00,093] [INFO] [timer.py:259:stop] epoch=0/micro_step=14660/global_step=14660, RunningAvgSamplesPerSec=2.633664071247747, CurrSamplesPerSec=2.6550070910775063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:27:15,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=14670, skipped=0, lr=[9.478838447927223e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:27:15,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=14670/global_step=14670, RunningAvgSamplesPerSec=2.6336758991777414, CurrSamplesPerSec=2.6696096887724377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:27:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=14680, skipped=0, lr=[9.478089234844882e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:27:30,469] [INFO] [timer.py:259:stop] epoch=0/micro_step=14680/global_step=14680, RunningAvgSamplesPerSec=2.6336796466907013, CurrSamplesPerSec=2.638961386474171, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:27:45,575] [INFO] [logging.py:96:log_dist] [Rank 0] step=14690, skipped=0, lr=[9.477339513274065e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:27:45,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=14690/global_step=14690, RunningAvgSamplesPerSec=2.633698189311739, CurrSamplesPerSec=2.6546046418038136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:28:00,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=14700, skipped=0, lr=[9.4765892832999e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:28:00,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=14700/global_step=14700, RunningAvgSamplesPerSec=2.633703291740667, CurrSamplesPerSec=2.6372087801843564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:28:16,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=14710, skipped=0, lr=[9.475838545007577e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:28:16,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=14710/global_step=14710, RunningAvgSamplesPerSec=2.633711612266556, CurrSamplesPerSec=2.672379650600198, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:28:31,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=14720, skipped=0, lr=[9.475087298482343e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:28:31,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=14720/global_step=14720, RunningAvgSamplesPerSec=2.6337174935327927, CurrSamplesPerSec=2.6440085987499478, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:28:46,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=14730, skipped=0, lr=[9.474335543809503e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:28:46,425] [INFO] [timer.py:259:stop] epoch=0/micro_step=14730/global_step=14730, RunningAvgSamplesPerSec=2.6337242117845263, CurrSamplesPerSec=2.6546516859425457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:29:01,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=14740, skipped=0, lr=[9.473583281074416e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:29:01,631] [INFO] [timer.py:259:stop] epoch=0/micro_step=14740/global_step=14740, RunningAvgSamplesPerSec=2.6337319833895414, CurrSamplesPerSec=2.6425847266227454, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:29:16,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=14750, skipped=0, lr=[9.472830510362506e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:29:16,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=14750/global_step=14750, RunningAvgSamplesPerSec=2.6337437077151864, CurrSamplesPerSec=2.6425114714032887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:29:31,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=14760, skipped=0, lr=[9.472077231759247e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:29:32,001] [INFO] [timer.py:259:stop] epoch=0/micro_step=14760/global_step=14760, RunningAvgSamplesPerSec=2.6337492043770987, CurrSamplesPerSec=2.662832823258695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:29:47,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=14770, skipped=0, lr=[9.471323445350176e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:29:47,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=14770/global_step=14770, RunningAvgSamplesPerSec=2.6337626026098704, CurrSamplesPerSec=2.6537782810501698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:30:02,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=14780, skipped=0, lr=[9.470569151220884e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:30:02,407] [INFO] [timer.py:259:stop] epoch=0/micro_step=14780/global_step=14780, RunningAvgSamplesPerSec=2.633764196207198, CurrSamplesPerSec=2.622507772764387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:30:17,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=14790, skipped=0, lr=[9.469814349457022e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:30:17,611] [INFO] [timer.py:259:stop] epoch=0/micro_step=14790/global_step=14790, RunningAvgSamplesPerSec=2.633772435698161, CurrSamplesPerSec=2.6480687332822237, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:30:32,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=14800, skipped=0, lr=[9.4690590401443e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:30:32,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=14800/global_step=14800, RunningAvgSamplesPerSec=2.633782912154249, CurrSamplesPerSec=2.642313785825849, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:30:47,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=14810, skipped=0, lr=[9.468303223368479e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:30:47,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=14810/global_step=14810, RunningAvgSamplesPerSec=2.6337900874819162, CurrSamplesPerSec=2.600272887168883, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:31:03,157] [INFO] [logging.py:96:log_dist] [Rank 0] step=14820, skipped=0, lr=[9.467546899215386e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:31:03,158] [INFO] [timer.py:259:stop] epoch=0/micro_step=14820/global_step=14820, RunningAvgSamplesPerSec=2.6338015745514305, CurrSamplesPerSec=2.6639033713327414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:31:18,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=14830, skipped=0, lr=[9.466790067770901e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:31:18,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=14830/global_step=14830, RunningAvgSamplesPerSec=2.6338093915119414, CurrSamplesPerSec=2.590079721734368, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:31:33,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=14840, skipped=0, lr=[9.466032729120963e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:31:33,558] [INFO] [timer.py:259:stop] epoch=0/micro_step=14840/global_step=14840, RunningAvgSamplesPerSec=2.6338188659359383, CurrSamplesPerSec=2.642718760889899, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:31:48,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=14850, skipped=0, lr=[9.465274883351568e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:31:48,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=14850/global_step=14850, RunningAvgSamplesPerSec=2.633828943375665, CurrSamplesPerSec=2.587640484880783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:32:03,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=14860, skipped=0, lr=[9.464516530548766e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:32:03,843] [INFO] [timer.py:259:stop] epoch=0/micro_step=14860/global_step=14860, RunningAvgSamplesPerSec=2.6338432669303504, CurrSamplesPerSec=2.655663538382678, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:32:18,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=14870, skipped=0, lr=[9.463757670798674e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:32:18,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=14870/global_step=14870, RunningAvgSamplesPerSec=2.6338572955740736, CurrSamplesPerSec=2.611695033614764, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:32:34,155] [INFO] [logging.py:96:log_dist] [Rank 0] step=14880, skipped=0, lr=[9.462998304187456e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:32:34,157] [INFO] [timer.py:259:stop] epoch=0/micro_step=14880/global_step=14880, RunningAvgSamplesPerSec=2.633866104382625, CurrSamplesPerSec=2.642136102139822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:32:49,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=14890, skipped=0, lr=[9.462238430801341e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:32:49,369] [INFO] [timer.py:259:stop] epoch=0/micro_step=14890/global_step=14890, RunningAvgSamplesPerSec=2.633872880296816, CurrSamplesPerSec=2.6231502963278097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:33:04,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=14900, skipped=0, lr=[9.461478050726613e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:33:04,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=14900/global_step=14900, RunningAvgSamplesPerSec=2.6338853092452403, CurrSamplesPerSec=2.667355526996219, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:33:19,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=14910, skipped=0, lr=[9.460717164049611e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:33:19,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=14910/global_step=14910, RunningAvgSamplesPerSec=2.633888353102614, CurrSamplesPerSec=2.6497730317216104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:33:34,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=14920, skipped=0, lr=[9.459955770856736e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:33:34,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=14920/global_step=14920, RunningAvgSamplesPerSec=2.6339043377710785, CurrSamplesPerSec=2.665939870694316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:33:50,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=14930, skipped=0, lr=[9.459193871234447e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:33:50,037] [INFO] [timer.py:259:stop] epoch=0/micro_step=14930/global_step=14930, RunningAvgSamplesPerSec=2.6339141892262257, CurrSamplesPerSec=2.643918598275627, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:34:05,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=14940, skipped=0, lr=[9.45843146526925e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:34:05,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=14940/global_step=14940, RunningAvgSamplesPerSec=2.6339219417440587, CurrSamplesPerSec=2.6537530952064854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:34:20,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=14950, skipped=0, lr=[9.457668553047725e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:34:20,390] [INFO] [timer.py:259:stop] epoch=0/micro_step=14950/global_step=14950, RunningAvgSamplesPerSec=2.633934035116277, CurrSamplesPerSec=2.655633692826692, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:34:35,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=14960, skipped=0, lr=[9.456905134656495e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:34:35,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=14960/global_step=14960, RunningAvgSamplesPerSec=2.633949991006982, CurrSamplesPerSec=2.6579190273563484, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:34:50,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=14970, skipped=0, lr=[9.45614121018225e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:34:50,743] [INFO] [timer.py:259:stop] epoch=0/micro_step=14970/global_step=14970, RunningAvgSamplesPerSec=2.633954051135158, CurrSamplesPerSec=2.652171962090786, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:35:05,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=14980, skipped=0, lr=[9.455376779711731e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:35:05,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=14980/global_step=14980, RunningAvgSamplesPerSec=2.633963248541433, CurrSamplesPerSec=2.6552827426038883, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:35:21,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=14990, skipped=0, lr=[9.454611843331743e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:35:21,137] [INFO] [timer.py:259:stop] epoch=0/micro_step=14990/global_step=14990, RunningAvgSamplesPerSec=2.6339685173070397, CurrSamplesPerSec=2.654186358331435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:35:36,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=15000, skipped=0, lr=[9.45384640112914e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:35:36,254] [INFO] [timer.py:259:stop] epoch=0/micro_step=15000/global_step=15000, RunningAvgSamplesPerSec=2.633984000521411, CurrSamplesPerSec=2.649403964439384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:35:51,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=15010, skipped=0, lr=[9.453080453190844e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:35:51,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=15010/global_step=15010, RunningAvgSamplesPerSec=2.633986607221158, CurrSamplesPerSec=2.6529025142447873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:36:06,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=15020, skipped=0, lr=[9.452313999603823e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:36:06,655] [INFO] [timer.py:259:stop] epoch=0/micro_step=15020/global_step=15020, RunningAvgSamplesPerSec=2.6339959632698453, CurrSamplesPerSec=2.6562176946261684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:36:21,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=15030, skipped=0, lr=[9.451547040455111e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:36:21,848] [INFO] [timer.py:259:stop] epoch=0/micro_step=15030/global_step=15030, RunningAvgSamplesPerSec=2.6340027270685167, CurrSamplesPerSec=2.6542102926849536, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:36:37,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=15040, skipped=0, lr=[9.450779575831797e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:36:37,018] [INFO] [timer.py:259:stop] epoch=0/micro_step=15040/global_step=15040, RunningAvgSamplesPerSec=2.6340119590629327, CurrSamplesPerSec=2.6752517721601796, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:36:52,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=15050, skipped=0, lr=[9.450011605821026e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:36:52,117] [INFO] [timer.py:259:stop] epoch=0/micro_step=15050/global_step=15050, RunningAvgSamplesPerSec=2.6340291840889236, CurrSamplesPerSec=2.6668119745787062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:37:07,275] [INFO] [logging.py:96:log_dist] [Rank 0] step=15060, skipped=0, lr=[9.44924313051e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:37:07,277] [INFO] [timer.py:259:stop] epoch=0/micro_step=15060/global_step=15060, RunningAvgSamplesPerSec=2.634040699969499, CurrSamplesPerSec=2.665021346293559, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:37:22,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=15070, skipped=0, lr=[9.44847414998598e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:37:22,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=15070/global_step=15070, RunningAvgSamplesPerSec=2.634049142319713, CurrSamplesPerSec=2.659244402502415, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:37:37,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=15080, skipped=0, lr=[9.447704664336286e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:37:37,611] [INFO] [timer.py:259:stop] epoch=0/micro_step=15080/global_step=15080, RunningAvgSamplesPerSec=2.634062917326868, CurrSamplesPerSec=2.6584336855453707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:37:52,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=15090, skipped=0, lr=[9.446934673648292e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:37:52,832] [INFO] [timer.py:259:stop] epoch=0/micro_step=15090/global_step=15090, RunningAvgSamplesPerSec=2.6340690202017014, CurrSamplesPerSec=2.6530665453820927, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:38:08,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=15100, skipped=0, lr=[9.44616417800943e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:38:08,012] [INFO] [timer.py:259:stop] epoch=0/micro_step=15100/global_step=15100, RunningAvgSamplesPerSec=2.6340771992469487, CurrSamplesPerSec=2.6282577851669284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:38:23,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=15110, skipped=0, lr=[9.445393177507191e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:38:23,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=15110/global_step=15110, RunningAvgSamplesPerSec=2.6340883491944065, CurrSamplesPerSec=2.6583991441421713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:38:38,306] [INFO] [logging.py:96:log_dist] [Rank 0] step=15120, skipped=0, lr=[9.444621672229122e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:38:38,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=15120/global_step=15120, RunningAvgSamplesPerSec=2.6340968094697215, CurrSamplesPerSec=2.6225192509595385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:38:53,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=15130, skipped=0, lr=[9.443849662262828e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:38:53,550] [INFO] [timer.py:259:stop] epoch=0/micro_step=15130/global_step=15130, RunningAvgSamplesPerSec=2.634101712130582, CurrSamplesPerSec=2.6622767486125425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:39:08,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=15140, skipped=0, lr=[9.443077147695971e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:39:08,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=15140/global_step=15140, RunningAvgSamplesPerSec=2.634106307786738, CurrSamplesPerSec=2.6096617831758193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:39:23,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=15150, skipped=0, lr=[9.44230412861627e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:39:23,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=15150/global_step=15150, RunningAvgSamplesPerSec=2.6341164225162044, CurrSamplesPerSec=2.651728878272947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:39:39,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=15160, skipped=0, lr=[9.441530605111501e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:39:39,186] [INFO] [timer.py:259:stop] epoch=0/micro_step=15160/global_step=15160, RunningAvgSamplesPerSec=2.6341218420690504, CurrSamplesPerSec=2.621295749151258, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:39:54,391] [INFO] [logging.py:96:log_dist] [Rank 0] step=15170, skipped=0, lr=[9.440756577269499e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:39:54,393] [INFO] [timer.py:259:stop] epoch=0/micro_step=15170/global_step=15170, RunningAvgSamplesPerSec=2.634129223776666, CurrSamplesPerSec=2.6623887054544744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:40:09,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=15180, skipped=0, lr=[9.439982045178154e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:40:09,503] [INFO] [timer.py:259:stop] epoch=0/micro_step=15180/global_step=15180, RunningAvgSamplesPerSec=2.634146893717427, CurrSamplesPerSec=2.6724102994297154, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:40:24,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=15190, skipped=0, lr=[9.439207008925415e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:40:24,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=15190/global_step=15190, RunningAvgSamplesPerSec=2.6341604177070246, CurrSamplesPerSec=2.6700676923882116, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:40:39,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=15200, skipped=0, lr=[9.438431468599288e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:40:39,890] [INFO] [timer.py:259:stop] epoch=0/micro_step=15200/global_step=15200, RunningAvgSamplesPerSec=2.634163855859161, CurrSamplesPerSec=2.6671553789636273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:40:55,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=15210, skipped=0, lr=[9.437655424287836e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:40:55,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=15210/global_step=15210, RunningAvgSamplesPerSec=2.6341795678228705, CurrSamplesPerSec=2.6586780284180658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:41:10,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=15220, skipped=0, lr=[9.436878876079177e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:41:10,165] [INFO] [timer.py:259:stop] epoch=0/micro_step=15220/global_step=15220, RunningAvgSamplesPerSec=2.634193045700519, CurrSamplesPerSec=2.641106673053141, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:41:25,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=15230, skipped=0, lr=[9.43610182406149e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:41:25,366] [INFO] [timer.py:259:stop] epoch=0/micro_step=15230/global_step=15230, RunningAvgSamplesPerSec=2.634203082770729, CurrSamplesPerSec=2.6538378894514714, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:41:40,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=15240, skipped=0, lr=[9.43532426832301e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:41:40,570] [INFO] [timer.py:259:stop] epoch=0/micro_step=15240/global_step=15240, RunningAvgSamplesPerSec=2.634208779001015, CurrSamplesPerSec=2.6474231367316596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:41:55,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=15250, skipped=0, lr=[9.434546208952029e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:41:55,748] [INFO] [timer.py:259:stop] epoch=0/micro_step=15250/global_step=15250, RunningAvgSamplesPerSec=2.63421889713894, CurrSamplesPerSec=2.658032723553193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:42:10,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=15260, skipped=0, lr=[9.433767646036893e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:42:10,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=15260/global_step=15260, RunningAvgSamplesPerSec=2.634224322151923, CurrSamplesPerSec=2.623660603077988, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:42:26,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=15270, skipped=0, lr=[9.43298857966601e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:42:26,144] [INFO] [timer.py:259:stop] epoch=0/micro_step=15270/global_step=15270, RunningAvgSamplesPerSec=2.6342327987898275, CurrSamplesPerSec=2.645260907483787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:42:41,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=15280, skipped=0, lr=[9.432209009927843e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:42:41,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=15280/global_step=15280, RunningAvgSamplesPerSec=2.6342379847735584, CurrSamplesPerSec=2.6392486626059477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:42:56,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=15290, skipped=0, lr=[9.431428936910914e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:42:56,598] [INFO] [timer.py:259:stop] epoch=0/micro_step=15290/global_step=15290, RunningAvgSamplesPerSec=2.6342400076989483, CurrSamplesPerSec=2.638351339745022, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:43:11,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=15300, skipped=0, lr=[9.4306483607038e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:43:11,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=15300/global_step=15300, RunningAvgSamplesPerSec=2.6342433058821557, CurrSamplesPerSec=2.636541535518511, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:43:26,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=15310, skipped=0, lr=[9.429867281395134e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:43:27,000] [INFO] [timer.py:259:stop] epoch=0/micro_step=15310/global_step=15310, RunningAvgSamplesPerSec=2.6342548188528814, CurrSamplesPerSec=2.6693353017784216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:43:42,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=15320, skipped=0, lr=[9.429085699073606e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:43:42,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=15320/global_step=15320, RunningAvgSamplesPerSec=2.6342616751058223, CurrSamplesPerSec=2.6542669810336035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:43:57,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=15330, skipped=0, lr=[9.428303613827969e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:43:57,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=15330/global_step=15330, RunningAvgSamplesPerSec=2.634268620941835, CurrSamplesPerSec=2.652412219851857, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:44:12,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=15340, skipped=0, lr=[9.427521025747028e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:44:12,583] [INFO] [timer.py:259:stop] epoch=0/micro_step=15340/global_step=15340, RunningAvgSamplesPerSec=2.634276626327988, CurrSamplesPerSec=2.6552911475129752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:44:27,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=15350, skipped=0, lr=[9.426737934919648e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:44:27,733] [INFO] [timer.py:259:stop] epoch=0/micro_step=15350/global_step=15350, RunningAvgSamplesPerSec=2.6342874191424275, CurrSamplesPerSec=2.652847142583554, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:44:42,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=15360, skipped=0, lr=[9.425954341434744e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:44:42,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=15360/global_step=15360, RunningAvgSamplesPerSec=2.6342948385256957, CurrSamplesPerSec=2.641437666565027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:44:58,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=15370, skipped=0, lr=[9.425170245381296e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:44:58,074] [INFO] [timer.py:259:stop] epoch=0/micro_step=15370/global_step=15370, RunningAvgSamplesPerSec=2.634303947707575, CurrSamplesPerSec=2.6164335385162234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:45:13,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=15380, skipped=0, lr=[9.42438564684834e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:45:13,321] [INFO] [timer.py:259:stop] epoch=0/micro_step=15380/global_step=15380, RunningAvgSamplesPerSec=2.6343071729219134, CurrSamplesPerSec=2.6489392198543182, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:45:28,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=15390, skipped=0, lr=[9.423600545924966e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:45:28,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=15390/global_step=15390, RunningAvgSamplesPerSec=2.6343166813975225, CurrSamplesPerSec=2.6036064372995242, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:45:43,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=15400, skipped=0, lr=[9.422814942700322e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:45:43,689] [INFO] [timer.py:259:stop] epoch=0/micro_step=15400/global_step=15400, RunningAvgSamplesPerSec=2.6343237446902092, CurrSamplesPerSec=2.654347608633859, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:45:58,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=15410, skipped=0, lr=[9.422028837263612e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:45:58,871] [INFO] [timer.py:259:stop] epoch=0/micro_step=15410/global_step=15410, RunningAvgSamplesPerSec=2.6343326895414543, CurrSamplesPerSec=2.602109903633002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:46:14,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=15420, skipped=0, lr=[9.421242229704104e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:46:14,055] [INFO] [timer.py:259:stop] epoch=0/micro_step=15420/global_step=15420, RunningAvgSamplesPerSec=2.6343420576864562, CurrSamplesPerSec=2.649866779468477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:46:29,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=15430, skipped=0, lr=[9.420455120111111e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:46:29,258] [INFO] [timer.py:259:stop] epoch=0/micro_step=15430/global_step=15430, RunningAvgSamplesPerSec=2.6343479542423585, CurrSamplesPerSec=2.6184830803120076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:46:44,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=15440, skipped=0, lr=[9.419667508574015e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:46:44,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=15440/global_step=15440, RunningAvgSamplesPerSec=2.6343547139903403, CurrSamplesPerSec=2.6547894676554504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:46:59,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=15450, skipped=0, lr=[9.418879395182246e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:46:59,644] [INFO] [timer.py:259:stop] epoch=0/micro_step=15450/global_step=15450, RunningAvgSamplesPerSec=2.63436249906118, CurrSamplesPerSec=2.647104006587792, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:47:14,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=15460, skipped=0, lr=[9.418090780025295e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:47:14,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=15460/global_step=15460, RunningAvgSamplesPerSec=2.6343713724557065, CurrSamplesPerSec=2.6806898663860537, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:47:30,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=15470, skipped=0, lr=[9.417301663192711e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:47:30,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=15470/global_step=15470, RunningAvgSamplesPerSec=2.6343728261286006, CurrSamplesPerSec=2.6227025057529123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:47:45,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=15480, skipped=0, lr=[9.416512044774097e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:47:45,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=15480/global_step=15480, RunningAvgSamplesPerSec=2.6343814573629314, CurrSamplesPerSec=2.636719296283777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:48:00,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=15490, skipped=0, lr=[9.415721924859117e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:48:00,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=15490/global_step=15490, RunningAvgSamplesPerSec=2.634387756272034, CurrSamplesPerSec=2.6506492458880864, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:48:15,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=15500, skipped=0, lr=[9.414931303537484e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:48:15,604] [INFO] [timer.py:259:stop] epoch=0/micro_step=15500/global_step=15500, RunningAvgSamplesPerSec=2.6343969417051305, CurrSamplesPerSec=2.6410804798729015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:48:30,844] [INFO] [logging.py:96:log_dist] [Rank 0] step=15510, skipped=0, lr=[9.414140180898981e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:48:30,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=15510/global_step=15510, RunningAvgSamplesPerSec=2.6343978155495362, CurrSamplesPerSec=2.630452095623966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:48:46,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=15520, skipped=0, lr=[9.413348557033433e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:48:46,062] [INFO] [timer.py:259:stop] epoch=0/micro_step=15520/global_step=15520, RunningAvgSamplesPerSec=2.6344061577001727, CurrSamplesPerSec=2.639444228812259, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:49:01,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=15530, skipped=0, lr=[9.412556432030734e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:49:01,268] [INFO] [timer.py:259:stop] epoch=0/micro_step=15530/global_step=15530, RunningAvgSamplesPerSec=2.6344125285160622, CurrSamplesPerSec=2.6624360258940327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:49:16,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=15540, skipped=0, lr=[9.411763805980825e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:49:16,381] [INFO] [timer.py:259:stop] epoch=0/micro_step=15540/global_step=15540, RunningAvgSamplesPerSec=2.6344266572588912, CurrSamplesPerSec=2.6755465774878058, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:49:31,555] [INFO] [logging.py:96:log_dist] [Rank 0] step=15550, skipped=0, lr=[9.410970678973714e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:49:31,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=15550/global_step=15550, RunningAvgSamplesPerSec=2.634434362451058, CurrSamplesPerSec=2.6478305153798423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:49:46,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=15560, skipped=0, lr=[9.410177051099459e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:49:46,727] [INFO] [timer.py:259:stop] epoch=0/micro_step=15560/global_step=15560, RunningAvgSamplesPerSec=2.634443710536002, CurrSamplesPerSec=2.6672257665846333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:50:01,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=15570, skipped=0, lr=[9.409382922448176e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:50:01,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=15570/global_step=15570, RunningAvgSamplesPerSec=2.6344493299703484, CurrSamplesPerSec=2.6556984291084955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:50:17,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=15580, skipped=0, lr=[9.40858829311004e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:50:17,026] [INFO] [timer.py:259:stop] epoch=0/micro_step=15580/global_step=15580, RunningAvgSamplesPerSec=2.634466203535652, CurrSamplesPerSec=2.67883436347593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:50:32,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=15590, skipped=0, lr=[9.40779316317528e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:50:32,260] [INFO] [timer.py:259:stop] epoch=0/micro_step=15590/global_step=15590, RunningAvgSamplesPerSec=2.6344671887461373, CurrSamplesPerSec=2.6083143937234103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:50:47,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=15600, skipped=0, lr=[9.406997532734184e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:50:47,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=15600/global_step=15600, RunningAvgSamplesPerSec=2.6344703335567226, CurrSamplesPerSec=2.64410152233494, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:51:02,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=15610, skipped=0, lr=[9.406201401877097e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:51:02,685] [INFO] [timer.py:259:stop] epoch=0/micro_step=15610/global_step=15610, RunningAvgSamplesPerSec=2.634473768757683, CurrSamplesPerSec=2.654752500426203, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:51:17,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=15620, skipped=0, lr=[9.405404770694417e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:51:17,919] [INFO] [timer.py:259:stop] epoch=0/micro_step=15620/global_step=15620, RunningAvgSamplesPerSec=2.6344763859473916, CurrSamplesPerSec=2.6263861440432534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:51:33,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=15630, skipped=0, lr=[9.404607639276606e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:51:33,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=15630/global_step=15630, RunningAvgSamplesPerSec=2.6344840180471363, CurrSamplesPerSec=2.658552902405017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:51:48,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=15640, skipped=0, lr=[9.403810007714173e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:51:48,362] [INFO] [timer.py:259:stop] epoch=0/micro_step=15640/global_step=15640, RunningAvgSamplesPerSec=2.634483858326162, CurrSamplesPerSec=2.5794528774747163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:52:03,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=15650, skipped=0, lr=[9.403011876097695e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:52:03,507] [INFO] [timer.py:259:stop] epoch=0/micro_step=15650/global_step=15650, RunningAvgSamplesPerSec=2.63449524760243, CurrSamplesPerSec=2.6397245493890513, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:52:18,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=15660, skipped=0, lr=[9.402213244517797e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:52:18,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=15660/global_step=15660, RunningAvgSamplesPerSec=2.634503627236974, CurrSamplesPerSec=2.6488438648908113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:52:33,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=15670, skipped=0, lr=[9.401414113065164e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:52:33,856] [INFO] [timer.py:259:stop] epoch=0/micro_step=15670/global_step=15670, RunningAvgSamplesPerSec=2.6345116260994126, CurrSamplesPerSec=2.6137823551519586, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:52:49,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=15680, skipped=0, lr=[9.40061448183054e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:52:49,107] [INFO] [timer.py:259:stop] epoch=0/micro_step=15680/global_step=15680, RunningAvgSamplesPerSec=2.634513333072507, CurrSamplesPerSec=2.601458282542659, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:53:04,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=15690, skipped=0, lr=[9.399814350904718e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:53:04,210] [INFO] [timer.py:259:stop] epoch=0/micro_step=15690/global_step=15690, RunningAvgSamplesPerSec=2.6345296143068766, CurrSamplesPerSec=2.6585777581502907, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:53:19,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=15700, skipped=0, lr=[9.399013720378558e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:53:19,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=15700/global_step=15700, RunningAvgSamplesPerSec=2.63453337844377, CurrSamplesPerSec=2.6378137365270704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:53:34,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=15710, skipped=0, lr=[9.39821259034297e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:53:34,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=15710/global_step=15710, RunningAvgSamplesPerSec=2.6345410232106907, CurrSamplesPerSec=2.641560770741729, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:53:49,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=15720, skipped=0, lr=[9.397410960888924e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:53:49,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=15720/global_step=15720, RunningAvgSamplesPerSec=2.6345502672455128, CurrSamplesPerSec=2.6712784620308065, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:54:04,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=15730, skipped=0, lr=[9.396608832107445e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:54:04,887] [INFO] [timer.py:259:stop] epoch=0/micro_step=15730/global_step=15730, RunningAvgSamplesPerSec=2.6345653073268775, CurrSamplesPerSec=2.676834072065446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:54:20,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=15740, skipped=0, lr=[9.395806204089613e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:54:20,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=15740/global_step=15740, RunningAvgSamplesPerSec=2.6345754717870022, CurrSamplesPerSec=2.6637519540739847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:54:35,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=15750, skipped=0, lr=[9.395003076926567e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:54:35,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=15750/global_step=15750, RunningAvgSamplesPerSec=2.634588285239923, CurrSamplesPerSec=2.6456025383626907, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:54:50,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=15760, skipped=0, lr=[9.394199450709505e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:54:50,400] [INFO] [timer.py:259:stop] epoch=0/micro_step=15760/global_step=15760, RunningAvgSamplesPerSec=2.634594114863612, CurrSamplesPerSec=2.628837633591769, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:55:05,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=15770, skipped=0, lr=[9.393395325529678e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:55:05,597] [INFO] [timer.py:259:stop] epoch=0/micro_step=15770/global_step=15770, RunningAvgSamplesPerSec=2.634601198150079, CurrSamplesPerSec=2.6321796242319593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:55:20,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=15780, skipped=0, lr=[9.392590701478392e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:55:20,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=15780/global_step=15780, RunningAvgSamplesPerSec=2.6346049903080098, CurrSamplesPerSec=2.645004846469661, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:55:36,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=15790, skipped=0, lr=[9.391785578647015e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:55:36,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=15790/global_step=15790, RunningAvgSamplesPerSec=2.634611473941176, CurrSamplesPerSec=2.640297003549213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:55:51,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=15800, skipped=0, lr=[9.390979957126969e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:55:51,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=15800/global_step=15800, RunningAvgSamplesPerSec=2.6346224263047833, CurrSamplesPerSec=2.6309095501877935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:56:06,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=15810, skipped=0, lr=[9.39017383700973e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:56:06,307] [INFO] [timer.py:259:stop] epoch=0/micro_step=15810/global_step=15810, RunningAvgSamplesPerSec=2.6346343585642615, CurrSamplesPerSec=2.6743046539144673, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:56:21,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=15820, skipped=0, lr=[9.389367218386837e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:56:21,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=15820/global_step=15820, RunningAvgSamplesPerSec=2.63464641005169, CurrSamplesPerSec=2.6389078404692645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:56:36,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=15830, skipped=0, lr=[9.388560101349878e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:56:36,593] [INFO] [timer.py:259:stop] epoch=0/micro_step=15830/global_step=15830, RunningAvgSamplesPerSec=2.634659437722128, CurrSamplesPerSec=2.6572000189575347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:56:51,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=15840, skipped=0, lr=[9.387752485990504e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:56:51,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=15840/global_step=15840, RunningAvgSamplesPerSec=2.6346598772519823, CurrSamplesPerSec=2.6476988870309426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:57:06,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=15850, skipped=0, lr=[9.38694437240042e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:57:07,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=15850/global_step=15850, RunningAvgSamplesPerSec=2.6346690922916607, CurrSamplesPerSec=2.6505458115750042, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:57:22,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=15860, skipped=0, lr=[9.386135760671386e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:57:22,260] [INFO] [timer.py:259:stop] epoch=0/micro_step=15860/global_step=15860, RunningAvgSamplesPerSec=2.6346711983992606, CurrSamplesPerSec=2.6558249682273236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:57:37,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=15870, skipped=0, lr=[9.385326650895222e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:57:37,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=15870/global_step=15870, RunningAvgSamplesPerSec=2.6346757923639723, CurrSamplesPerSec=2.64502527948605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:57:52,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=15880, skipped=0, lr=[9.3845170431638e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:57:52,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=15880/global_step=15880, RunningAvgSamplesPerSec=2.6346810217400733, CurrSamplesPerSec=2.655844307493784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:58:07,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=15890, skipped=0, lr=[9.383706937569054e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:58:07,834] [INFO] [timer.py:259:stop] epoch=0/micro_step=15890/global_step=15890, RunningAvgSamplesPerSec=2.63469250771866, CurrSamplesPerSec=2.658037355826688, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:58:23,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=15900, skipped=0, lr=[9.38289633420297e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:58:23,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=15900/global_step=15900, RunningAvgSamplesPerSec=2.634697427229152, CurrSamplesPerSec=2.649344555052072, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:58:38,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=15910, skipped=0, lr=[9.382085233157594e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:58:38,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=15910/global_step=15910, RunningAvgSamplesPerSec=2.634708163040026, CurrSamplesPerSec=2.6284698449396293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:58:53,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=15920, skipped=0, lr=[9.381273634525023e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:58:53,351] [INFO] [timer.py:259:stop] epoch=0/micro_step=15920/global_step=15920, RunningAvgSamplesPerSec=2.634716384253595, CurrSamplesPerSec=2.6373911915290704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:59:08,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=15930, skipped=0, lr=[9.380461538397419e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:59:08,543] [INFO] [timer.py:259:stop] epoch=0/micro_step=15930/global_step=15930, RunningAvgSamplesPerSec=2.634722743035266, CurrSamplesPerSec=2.6167159305780827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:59:23,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=15940, skipped=0, lr=[9.379648944866993e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:59:23,681] [INFO] [timer.py:259:stop] epoch=0/micro_step=15940/global_step=15940, RunningAvgSamplesPerSec=2.6347350376866046, CurrSamplesPerSec=2.661237900367977, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:59:38,825] [INFO] [logging.py:96:log_dist] [Rank 0] step=15950, skipped=0, lr=[9.378835854026015e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:59:38,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=15950/global_step=15950, RunningAvgSamplesPerSec=2.634744914228167, CurrSamplesPerSec=2.63445991487836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 06:59:53,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=15960, skipped=0, lr=[9.378022265966815e-06], mom=[(0.9, 0.95)] +[2024-11-01 06:59:53,982] [INFO] [timer.py:259:stop] epoch=0/micro_step=15960/global_step=15960, RunningAvgSamplesPerSec=2.63475498644021, CurrSamplesPerSec=2.6762157834567053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:00:09,153] [INFO] [logging.py:96:log_dist] [Rank 0] step=15970, skipped=0, lr=[9.377208180781774e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:00:09,154] [INFO] [timer.py:259:stop] epoch=0/micro_step=15970/global_step=15970, RunningAvgSamplesPerSec=2.6347626386666367, CurrSamplesPerSec=2.633171108779524, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:00:24,339] [INFO] [logging.py:96:log_dist] [Rank 0] step=15980, skipped=0, lr=[9.37639359856333e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:00:24,340] [INFO] [timer.py:259:stop] epoch=0/micro_step=15980/global_step=15980, RunningAvgSamplesPerSec=2.6347713576232294, CurrSamplesPerSec=2.654124215007885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:00:39,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=15990, skipped=0, lr=[9.37557851940398e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:00:39,545] [INFO] [timer.py:259:stop] epoch=0/micro_step=15990/global_step=15990, RunningAvgSamplesPerSec=2.634775923025963, CurrSamplesPerSec=2.665335919378885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:00:54,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=16000, skipped=0, lr=[9.374762943396277e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:00:54,712] [INFO] [timer.py:259:stop] epoch=0/micro_step=16000/global_step=16000, RunningAvgSamplesPerSec=2.6347848487931347, CurrSamplesPerSec=2.6580091413202696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:01:09,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=16010, skipped=0, lr=[9.373946870632831e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:01:09,870] [INFO] [timer.py:259:stop] epoch=0/micro_step=16010/global_step=16010, RunningAvgSamplesPerSec=2.634794186170535, CurrSamplesPerSec=2.6708106891311165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:01:25,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=16020, skipped=0, lr=[9.373130301206306e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:01:25,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=16020/global_step=16020, RunningAvgSamplesPerSec=2.6348027193594947, CurrSamplesPerSec=2.6488564112043127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:01:40,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=16030, skipped=0, lr=[9.372313235209425e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:01:40,266] [INFO] [timer.py:259:stop] epoch=0/micro_step=16030/global_step=16030, RunningAvgSamplesPerSec=2.634807371013858, CurrSamplesPerSec=2.6649205969960614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:01:55,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=16040, skipped=0, lr=[9.371495672734962e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:01:55,427] [INFO] [timer.py:259:stop] epoch=0/micro_step=16040/global_step=16040, RunningAvgSamplesPerSec=2.63481716263536, CurrSamplesPerSec=2.6490475480763456, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:02:10,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=16050, skipped=0, lr=[9.370677613875758e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:02:10,639] [INFO] [timer.py:259:stop] epoch=0/micro_step=16050/global_step=16050, RunningAvgSamplesPerSec=2.6348225491556674, CurrSamplesPerSec=2.663447479859138, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:02:25,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=16060, skipped=0, lr=[9.3698590587247e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:02:25,787] [INFO] [timer.py:259:stop] epoch=0/micro_step=16060/global_step=16060, RunningAvgSamplesPerSec=2.634833641467115, CurrSamplesPerSec=2.6635290890354795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:02:40,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=16070, skipped=0, lr=[9.369040007374734e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:02:40,958] [INFO] [timer.py:259:stop] epoch=0/micro_step=16070/global_step=16070, RunningAvgSamplesPerSec=2.6348421513573577, CurrSamplesPerSec=2.6378079302810344, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:02:56,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=16080, skipped=0, lr=[9.368220459918866e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:02:56,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=16080/global_step=16080, RunningAvgSamplesPerSec=2.6348511965498402, CurrSamplesPerSec=2.6352461392201856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:03:11,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=16090, skipped=0, lr=[9.367400416450155e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:03:11,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=16090/global_step=16090, RunningAvgSamplesPerSec=2.6348560280888966, CurrSamplesPerSec=2.645237968400665, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:03:26,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=16100, skipped=0, lr=[9.366579877061717e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:03:26,513] [INFO] [timer.py:259:stop] epoch=0/micro_step=16100/global_step=16100, RunningAvgSamplesPerSec=2.63486836790425, CurrSamplesPerSec=2.6438423526681327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:03:41,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=16110, skipped=0, lr=[9.365758841846727e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:03:41,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=16110/global_step=16110, RunningAvgSamplesPerSec=2.6348783242519556, CurrSamplesPerSec=2.652145968223572, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:03:56,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=16120, skipped=0, lr=[9.36493731089841e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:03:56,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=16120/global_step=16120, RunningAvgSamplesPerSec=2.634888624418727, CurrSamplesPerSec=2.6305816022316733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:04:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=16130, skipped=0, lr=[9.364115284310051e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:04:12,034] [INFO] [timer.py:259:stop] epoch=0/micro_step=16130/global_step=16130, RunningAvgSamplesPerSec=2.6348952480754653, CurrSamplesPerSec=2.662435180871433, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:04:27,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=16140, skipped=0, lr=[9.363292762174997e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:04:27,157] [INFO] [timer.py:259:stop] epoch=0/micro_step=16140/global_step=16140, RunningAvgSamplesPerSec=2.6349083731615868, CurrSamplesPerSec=2.658434106787535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:04:42,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=16150, skipped=0, lr=[9.362469744586637e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:04:42,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=16150/global_step=16150, RunningAvgSamplesPerSec=2.6349202655607917, CurrSamplesPerSec=2.6537224530746393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:04:57,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=16160, skipped=0, lr=[9.361646231638433e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:04:57,427] [INFO] [timer.py:259:stop] epoch=0/micro_step=16160/global_step=16160, RunningAvgSamplesPerSec=2.6349328500906135, CurrSamplesPerSec=2.6585520598455252, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:05:12,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=16170, skipped=0, lr=[9.360822223423892e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:05:12,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=16170/global_step=16170, RunningAvgSamplesPerSec=2.634938713167465, CurrSamplesPerSec=2.6509700692314944, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:05:27,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=16180, skipped=0, lr=[9.35999772003658e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:05:27,699] [INFO] [timer.py:259:stop] epoch=0/micro_step=16180/global_step=16180, RunningAvgSamplesPerSec=2.6349546279841527, CurrSamplesPerSec=2.6581822278003853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:05:42,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=16190, skipped=0, lr=[9.359172721570117e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:05:42,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=16190/global_step=16190, RunningAvgSamplesPerSec=2.6349634824584456, CurrSamplesPerSec=2.633133501366793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:05:57,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=16200, skipped=0, lr=[9.358347228118187e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:05:57,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=16200/global_step=16200, RunningAvgSamplesPerSec=2.634977852331532, CurrSamplesPerSec=2.6576638779700565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:06:13,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=16210, skipped=0, lr=[9.357521239774523e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:06:13,129] [INFO] [timer.py:259:stop] epoch=0/micro_step=16210/global_step=16210, RunningAvgSamplesPerSec=2.634987512944413, CurrSamplesPerSec=2.647796666840141, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:06:28,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=16220, skipped=0, lr=[9.356694756632915e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:06:28,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=16220/global_step=16220, RunningAvgSamplesPerSec=2.634996233020321, CurrSamplesPerSec=2.669229129998413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:06:43,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=16230, skipped=0, lr=[9.355867778787212e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:06:43,471] [INFO] [timer.py:259:stop] epoch=0/micro_step=16230/global_step=16230, RunningAvgSamplesPerSec=2.6350024915438985, CurrSamplesPerSec=2.6468333910552926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:06:58,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=16240, skipped=0, lr=[9.355040306331317e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:06:58,604] [INFO] [timer.py:259:stop] epoch=0/micro_step=16240/global_step=16240, RunningAvgSamplesPerSec=2.6350136836865143, CurrSamplesPerSec=2.612054889108163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:07:13,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=16250, skipped=0, lr=[9.35421233935919e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:07:13,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=16250/global_step=16250, RunningAvgSamplesPerSec=2.6350317362598843, CurrSamplesPerSec=2.661269560588381, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:07:28,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=16260, skipped=0, lr=[9.353383877964847e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:07:28,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=16260/global_step=16260, RunningAvgSamplesPerSec=2.635042548988494, CurrSamplesPerSec=2.6162752297491574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:07:43,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=16270, skipped=0, lr=[9.352554922242358e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:07:43,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=16270/global_step=16270, RunningAvgSamplesPerSec=2.635059949256874, CurrSamplesPerSec=2.648700008990497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:07:58,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=16280, skipped=0, lr=[9.351725472285855e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:07:58,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=16280/global_step=16280, RunningAvgSamplesPerSec=2.635073952360208, CurrSamplesPerSec=2.6265465012134546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:08:14,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=16290, skipped=0, lr=[9.350895528189519e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:08:14,128] [INFO] [timer.py:259:stop] epoch=0/micro_step=16290/global_step=16290, RunningAvgSamplesPerSec=2.63508623125621, CurrSamplesPerSec=2.6698106301950184, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:08:29,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=16300, skipped=0, lr=[9.350065090047593e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:08:29,314] [INFO] [timer.py:259:stop] epoch=0/micro_step=16300/global_step=16300, RunningAvgSamplesPerSec=2.635092359040531, CurrSamplesPerSec=2.6171861751369505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:08:44,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=16310, skipped=0, lr=[9.349234157954372e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:08:44,428] [INFO] [timer.py:259:stop] epoch=0/micro_step=16310/global_step=16310, RunningAvgSamplesPerSec=2.6351063109402912, CurrSamplesPerSec=2.655428995614258, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:08:59,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=16320, skipped=0, lr=[9.348402732004207e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:08:59,589] [INFO] [timer.py:259:stop] epoch=0/micro_step=16320/global_step=16320, RunningAvgSamplesPerSec=2.6351178584981136, CurrSamplesPerSec=2.6614223845122953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:09:14,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=16330, skipped=0, lr=[9.34757081229151e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:09:14,758] [INFO] [timer.py:259:stop] epoch=0/micro_step=16330/global_step=16330, RunningAvgSamplesPerSec=2.6351266097974864, CurrSamplesPerSec=2.6499818806397153, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:09:29,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=16340, skipped=0, lr=[9.346738398910745e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:09:29,932] [INFO] [timer.py:259:stop] epoch=0/micro_step=16340/global_step=16340, RunningAvgSamplesPerSec=2.6351357345921236, CurrSamplesPerSec=2.6584012502995957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:09:45,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=16350, skipped=0, lr=[9.34590549195643e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:09:45,085] [INFO] [timer.py:259:stop] epoch=0/micro_step=16350/global_step=16350, RunningAvgSamplesPerSec=2.635144982571798, CurrSamplesPerSec=2.6461717019314035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:10:00,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=16360, skipped=0, lr=[9.345072091523145e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:10:00,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=16360/global_step=16360, RunningAvgSamplesPerSec=2.6351569860553576, CurrSamplesPerSec=2.653831592662869, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:10:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=16370, skipped=0, lr=[9.344238197705521e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:10:15,334] [INFO] [timer.py:259:stop] epoch=0/micro_step=16370/global_step=16370, RunningAvgSamplesPerSec=2.635169612649537, CurrSamplesPerSec=2.6591196446335856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:10:30,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=16380, skipped=0, lr=[9.343403810598248e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:10:30,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=16380/global_step=16380, RunningAvgSamplesPerSec=2.6351724421805987, CurrSamplesPerSec=2.6462126043448477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:10:45,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=16390, skipped=0, lr=[9.342568930296073e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:10:45,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=16390/global_step=16390, RunningAvgSamplesPerSec=2.635178762985785, CurrSamplesPerSec=2.647833440602912, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:11:00,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=16400, skipped=0, lr=[9.341733556893791e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:11:00,939] [INFO] [timer.py:259:stop] epoch=0/micro_step=16400/global_step=16400, RunningAvgSamplesPerSec=2.635184986732611, CurrSamplesPerSec=2.660108334660751, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:11:16,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=16410, skipped=0, lr=[9.340897690486263e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:11:16,098] [INFO] [timer.py:259:stop] epoch=0/micro_step=16410/global_step=16410, RunningAvgSamplesPerSec=2.6351967271043786, CurrSamplesPerSec=2.6477866378093875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:11:31,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=16420, skipped=0, lr=[9.340061331168402e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:11:31,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=16420/global_step=16420, RunningAvgSamplesPerSec=2.6352067421851313, CurrSamplesPerSec=2.677264224376795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:11:46,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=16430, skipped=0, lr=[9.339224479035178e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:11:46,380] [INFO] [timer.py:259:stop] epoch=0/micro_step=16430/global_step=16430, RunningAvgSamplesPerSec=2.635221779858447, CurrSamplesPerSec=2.6575182205855272, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:12:01,533] [INFO] [logging.py:96:log_dist] [Rank 0] step=16440, skipped=0, lr=[9.338387134181613e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:12:01,534] [INFO] [timer.py:259:stop] epoch=0/micro_step=16440/global_step=16440, RunningAvgSamplesPerSec=2.6352307324974067, CurrSamplesPerSec=2.651584289772158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:12:16,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=16450, skipped=0, lr=[9.337549296702788e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:12:16,657] [INFO] [timer.py:259:stop] epoch=0/micro_step=16450/global_step=16450, RunningAvgSamplesPerSec=2.6352435408850057, CurrSamplesPerSec=2.6572235868352836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:12:31,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=16460, skipped=0, lr=[9.336710966693841e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:12:31,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=16460/global_step=16460, RunningAvgSamplesPerSec=2.6352615807657336, CurrSamplesPerSec=2.6715489945804327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:12:46,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=16470, skipped=0, lr=[9.335872144249965e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:12:46,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=16470/global_step=16470, RunningAvgSamplesPerSec=2.6352808452173524, CurrSamplesPerSec=2.681398502929338, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:13:01,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=16480, skipped=0, lr=[9.335032829466406e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:13:01,917] [INFO] [timer.py:259:stop] epoch=0/micro_step=16480/global_step=16480, RunningAvgSamplesPerSec=2.6352935018605717, CurrSamplesPerSec=2.6616242065299573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:13:17,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=16490, skipped=0, lr=[9.33419302243847e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:13:17,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=16490/global_step=16490, RunningAvgSamplesPerSec=2.6353083584355743, CurrSamplesPerSec=2.6581228452269112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:13:32,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=16500, skipped=0, lr=[9.333352723261521e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:13:32,164] [INFO] [timer.py:259:stop] epoch=0/micro_step=16500/global_step=16500, RunningAvgSamplesPerSec=2.6353186958894774, CurrSamplesPerSec=2.670559435157689, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:13:47,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=16510, skipped=0, lr=[9.33251193203097e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:13:47,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=16510/global_step=16510, RunningAvgSamplesPerSec=2.6353380639277963, CurrSamplesPerSec=2.6824908965293695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:14:02,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=16520, skipped=0, lr=[9.33167064884229e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:14:02,374] [INFO] [timer.py:259:stop] epoch=0/micro_step=16520/global_step=16520, RunningAvgSamplesPerSec=2.635348272824464, CurrSamplesPerSec=2.6551243200216503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:14:17,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=16530, skipped=0, lr=[9.330828873791012e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:14:17,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=16530/global_step=16530, RunningAvgSamplesPerSec=2.63535472629514, CurrSamplesPerSec=2.666041968046265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:14:32,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=16540, skipped=0, lr=[9.329986606972716e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:14:32,726] [INFO] [timer.py:259:stop] epoch=0/micro_step=16540/global_step=16540, RunningAvgSamplesPerSec=2.635362369624521, CurrSamplesPerSec=2.646899786966002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:14:47,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=16550, skipped=0, lr=[9.329143848483045e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:14:47,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=16550/global_step=16550, RunningAvgSamplesPerSec=2.635375662590832, CurrSamplesPerSec=2.654852482351169, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:15:02,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=16560, skipped=0, lr=[9.32830059841769e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:15:02,911] [INFO] [timer.py:259:stop] epoch=0/micro_step=16560/global_step=16560, RunningAvgSamplesPerSec=2.6353942245838673, CurrSamplesPerSec=2.6818931423732453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:15:18,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=16570, skipped=0, lr=[9.327456856872407e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:15:18,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=16570/global_step=16570, RunningAvgSamplesPerSec=2.635402480082981, CurrSamplesPerSec=2.641601530734066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:15:33,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=16580, skipped=0, lr=[9.326612623943001e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:15:33,208] [INFO] [timer.py:259:stop] epoch=0/micro_step=16580/global_step=16580, RunningAvgSamplesPerSec=2.635414629099551, CurrSamplesPerSec=2.653883227211512, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:15:48,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=16590, skipped=0, lr=[9.325767899725337e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:15:48,364] [INFO] [timer.py:259:stop] epoch=0/micro_step=16590/global_step=16590, RunningAvgSamplesPerSec=2.6354230365695606, CurrSamplesPerSec=2.6164559807423293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:16:03,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=16600, skipped=0, lr=[9.32492268431533e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:16:03,472] [INFO] [timer.py:259:stop] epoch=0/micro_step=16600/global_step=16600, RunningAvgSamplesPerSec=2.635436503717197, CurrSamplesPerSec=2.660291818561241, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:16:18,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=16610, skipped=0, lr=[9.324076977808958e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:16:18,635] [INFO] [timer.py:259:stop] epoch=0/micro_step=16610/global_step=16610, RunningAvgSamplesPerSec=2.635443982057825, CurrSamplesPerSec=2.5943905620106644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:16:33,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=16620, skipped=0, lr=[9.323230780302248e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:16:33,788] [INFO] [timer.py:259:stop] epoch=0/micro_step=16620/global_step=16620, RunningAvgSamplesPerSec=2.6354539489617035, CurrSamplesPerSec=2.652569060788099, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:16:48,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=16630, skipped=0, lr=[9.32238409189129e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:16:48,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=16630/global_step=16630, RunningAvgSamplesPerSec=2.635464465414429, CurrSamplesPerSec=2.6299638796473475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:17:04,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=16640, skipped=0, lr=[9.321536912672221e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:17:04,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=16640/global_step=16640, RunningAvgSamplesPerSec=2.635473880320355, CurrSamplesPerSec=2.6322200952289303, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:17:19,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=16650, skipped=0, lr=[9.320689242741242e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:17:19,222] [INFO] [timer.py:259:stop] epoch=0/micro_step=16650/global_step=16650, RunningAvgSamplesPerSec=2.635484842204458, CurrSamplesPerSec=2.6372100238126372, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:17:34,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=16660, skipped=0, lr=[9.319841082194606e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:17:34,350] [INFO] [timer.py:259:stop] epoch=0/micro_step=16660/global_step=16660, RunningAvgSamplesPerSec=2.635497005643473, CurrSamplesPerSec=2.6573776300463545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:17:49,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=16670, skipped=0, lr=[9.31899243112862e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:17:49,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=16670/global_step=16670, RunningAvgSamplesPerSec=2.6354985641441147, CurrSamplesPerSec=2.6694780099556783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:18:04,693] [INFO] [logging.py:96:log_dist] [Rank 0] step=16680, skipped=0, lr=[9.31814328963965e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:18:04,712] [INFO] [timer.py:259:stop] epoch=0/micro_step=16680/global_step=16680, RunningAvgSamplesPerSec=2.635509635840639, CurrSamplesPerSec=2.661820991354553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:18:19,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=16690, skipped=0, lr=[9.317293657824116e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:18:19,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=16690/global_step=16690, RunningAvgSamplesPerSec=2.6355223169180024, CurrSamplesPerSec=2.6665754585504744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:18:34,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=16700, skipped=0, lr=[9.316443535778494e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:18:34,973] [INFO] [timer.py:259:stop] epoch=0/micro_step=16700/global_step=16700, RunningAvgSamplesPerSec=2.6355341943930743, CurrSamplesPerSec=2.6838190009966385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:18:50,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=16710, skipped=0, lr=[9.315592923599314e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:18:50,089] [INFO] [timer.py:259:stop] epoch=0/micro_step=16710/global_step=16710, RunningAvgSamplesPerSec=2.6355472059025717, CurrSamplesPerSec=2.6438294371811164, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:19:05,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=16720, skipped=0, lr=[9.314741821383166e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:19:05,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=16720/global_step=16720, RunningAvgSamplesPerSec=2.6355621613977376, CurrSamplesPerSec=2.669506468470524, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:19:20,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=16730, skipped=0, lr=[9.313890229226692e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:19:20,304] [INFO] [timer.py:259:stop] epoch=0/micro_step=16730/global_step=16730, RunningAvgSamplesPerSec=2.635575957481616, CurrSamplesPerSec=2.6732504361605733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:19:35,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=16740, skipped=0, lr=[9.31303814722659e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:19:35,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=16740/global_step=16740, RunningAvgSamplesPerSec=2.6355873317351977, CurrSamplesPerSec=2.658073151211756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:19:50,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=16750, skipped=0, lr=[9.312185575479615e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:19:50,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=16750/global_step=16750, RunningAvgSamplesPerSec=2.6355937850626803, CurrSamplesPerSec=2.6468972813994585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:20:05,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=16760, skipped=0, lr=[9.311332514082576e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:20:05,757] [INFO] [timer.py:259:stop] epoch=0/micro_step=16760/global_step=16760, RunningAvgSamplesPerSec=2.635609381345456, CurrSamplesPerSec=2.6801070430373115, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:20:20,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=16770, skipped=0, lr=[9.31047896313234e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:20:20,861] [INFO] [timer.py:259:stop] epoch=0/micro_step=16770/global_step=16770, RunningAvgSamplesPerSec=2.6356228431830915, CurrSamplesPerSec=2.6617179501778803, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:20:35,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=16780, skipped=0, lr=[9.309624922725824e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:20:35,990] [INFO] [timer.py:259:stop] epoch=0/micro_step=16780/global_step=16780, RunningAvgSamplesPerSec=2.635634291967127, CurrSamplesPerSec=2.660207455003804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:20:51,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=16790, skipped=0, lr=[9.30877039296001e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:20:51,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=16790/global_step=16790, RunningAvgSamplesPerSec=2.635648663950151, CurrSamplesPerSec=2.661310508923964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:21:06,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=16800, skipped=0, lr=[9.307915373931924e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:21:06,193] [INFO] [timer.py:259:stop] epoch=0/micro_step=16800/global_step=16800, RunningAvgSamplesPerSec=2.635663684875581, CurrSamplesPerSec=2.655193233622879, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:21:21,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=16810, skipped=0, lr=[9.30705986573866e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:21:21,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=16810/global_step=16810, RunningAvgSamplesPerSec=2.635680167418788, CurrSamplesPerSec=2.6690894208061655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:21:36,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=16820, skipped=0, lr=[9.306203868477357e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:21:36,360] [INFO] [timer.py:259:stop] epoch=0/micro_step=16820/global_step=16820, RunningAvgSamplesPerSec=2.6356956041637205, CurrSamplesPerSec=2.682324493433549, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:21:51,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=16830, skipped=0, lr=[9.305347382245216e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:21:51,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=16830/global_step=16830, RunningAvgSamplesPerSec=2.6357083194429602, CurrSamplesPerSec=2.6610986043469547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:22:06,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=16840, skipped=0, lr=[9.304490407139491e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:22:06,600] [INFO] [timer.py:259:stop] epoch=0/micro_step=16840/global_step=16840, RunningAvgSamplesPerSec=2.635721942717443, CurrSamplesPerSec=2.6797367543155923, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:22:21,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=16850, skipped=0, lr=[9.303632943257489e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:22:21,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=16850/global_step=16850, RunningAvgSamplesPerSec=2.6357305280513783, CurrSamplesPerSec=2.6579619781097388, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:22:36,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=16860, skipped=0, lr=[9.302774990696578e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:22:36,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=16860/global_step=16860, RunningAvgSamplesPerSec=2.6357411263524453, CurrSamplesPerSec=2.6549621350530637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:22:52,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=16870, skipped=0, lr=[9.301916549554179e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:22:52,049] [INFO] [timer.py:259:stop] epoch=0/micro_step=16870/global_step=16870, RunningAvgSamplesPerSec=2.635749411191545, CurrSamplesPerSec=2.660093150920217, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:23:07,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=16880, skipped=0, lr=[9.301057619927766e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:23:07,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=16880/global_step=16880, RunningAvgSamplesPerSec=2.635754131254861, CurrSamplesPerSec=2.642389110947296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:23:22,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=16890, skipped=0, lr=[9.300198201914874e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:23:22,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=16890/global_step=16890, RunningAvgSamplesPerSec=2.6357511681195485, CurrSamplesPerSec=2.6579514508358812, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:23:37,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=16900, skipped=0, lr=[9.299338295613087e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:23:37,765] [INFO] [timer.py:259:stop] epoch=0/micro_step=16900/global_step=16900, RunningAvgSamplesPerSec=2.6357497306441786, CurrSamplesPerSec=2.6395443068065423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:23:52,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=16910, skipped=0, lr=[9.298477901120051e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:23:52,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=16910/global_step=16910, RunningAvgSamplesPerSec=2.6357573131210104, CurrSamplesPerSec=2.655025998271546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:24:08,088] [INFO] [logging.py:96:log_dist] [Rank 0] step=16920, skipped=0, lr=[9.29761701853346e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:24:08,089] [INFO] [timer.py:259:stop] epoch=0/micro_step=16920/global_step=16920, RunningAvgSamplesPerSec=2.635766535813171, CurrSamplesPerSec=2.642793276502487, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:24:23,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=16930, skipped=0, lr=[9.296755647951071e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:24:23,217] [INFO] [timer.py:259:stop] epoch=0/micro_step=16930/global_step=16930, RunningAvgSamplesPerSec=2.6357789391566455, CurrSamplesPerSec=2.6726384850372167, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:24:38,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=16940, skipped=0, lr=[9.295893789470692e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:24:38,362] [INFO] [timer.py:259:stop] epoch=0/micro_step=16940/global_step=16940, RunningAvgSamplesPerSec=2.6357893912655945, CurrSamplesPerSec=2.625986980856575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:24:53,506] [INFO] [logging.py:96:log_dist] [Rank 0] step=16950, skipped=0, lr=[9.295031443190186e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:24:53,507] [INFO] [timer.py:259:stop] epoch=0/micro_step=16950/global_step=16950, RunningAvgSamplesPerSec=2.6358001440434258, CurrSamplesPerSec=2.6720621360435386, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:25:08,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=16960, skipped=0, lr=[9.294168609207474e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:25:08,598] [INFO] [timer.py:259:stop] epoch=0/micro_step=16960/global_step=16960, RunningAvgSamplesPerSec=2.635814569184378, CurrSamplesPerSec=2.6094986104935094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:25:23,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=16970, skipped=0, lr=[9.29330528762053e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:25:23,684] [INFO] [timer.py:259:stop] epoch=0/micro_step=16970/global_step=16970, RunningAvgSamplesPerSec=2.6358302294719604, CurrSamplesPerSec=2.6624094079398044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:25:38,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=16980, skipped=0, lr=[9.292441478527386e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:25:38,751] [INFO] [timer.py:259:stop] epoch=0/micro_step=16980/global_step=16980, RunningAvgSamplesPerSec=2.635847077839279, CurrSamplesPerSec=2.642432809809295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:25:53,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=16990, skipped=0, lr=[9.291577182026126e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:25:53,842] [INFO] [timer.py:259:stop] epoch=0/micro_step=16990/global_step=16990, RunningAvgSamplesPerSec=2.635862582246528, CurrSamplesPerSec=2.658602193064771, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:26:09,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=17000, skipped=0, lr=[9.290712398214892e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:26:09,003] [INFO] [timer.py:259:stop] epoch=0/micro_step=17000/global_step=17000, RunningAvgSamplesPerSec=2.635872217875927, CurrSamplesPerSec=2.6170392055575946, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:26:24,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=17010, skipped=0, lr=[9.28984712719188e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:26:24,084] [INFO] [timer.py:259:stop] epoch=0/micro_step=17010/global_step=17010, RunningAvgSamplesPerSec=2.635889054106504, CurrSamplesPerSec=2.679039690612475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:26:39,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=17020, skipped=0, lr=[9.288981369055343e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:26:39,238] [INFO] [timer.py:259:stop] epoch=0/micro_step=17020/global_step=17020, RunningAvgSamplesPerSec=2.635898789785651, CurrSamplesPerSec=2.6534752431296953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:26:54,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=17030, skipped=0, lr=[9.288115123903588e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:26:54,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=17030/global_step=17030, RunningAvgSamplesPerSec=2.6359049048573144, CurrSamplesPerSec=2.630012528176302, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:27:09,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=17040, skipped=0, lr=[9.287248391834976e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:27:09,673] [INFO] [timer.py:259:stop] epoch=0/micro_step=17040/global_step=17040, RunningAvgSamplesPerSec=2.635904458417855, CurrSamplesPerSec=2.653555402938181, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:27:24,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=17050, skipped=0, lr=[9.286381172947927e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:27:24,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=17050/global_step=17050, RunningAvgSamplesPerSec=2.635914027948461, CurrSamplesPerSec=2.654835258037256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:27:39,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=17060, skipped=0, lr=[9.285513467340911e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:27:39,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=17060/global_step=17060, RunningAvgSamplesPerSec=2.635930473385808, CurrSamplesPerSec=2.675229163146106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:27:54,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=17070, skipped=0, lr=[9.284645275112458e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:27:54,969] [INFO] [timer.py:259:stop] epoch=0/micro_step=17070/global_step=17070, RunningAvgSamplesPerSec=2.635947745884066, CurrSamplesPerSec=2.6639401708643553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:28:10,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=17080, skipped=0, lr=[9.283776596361151e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:28:10,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=17080/global_step=17080, RunningAvgSamplesPerSec=2.635957234842603, CurrSamplesPerSec=2.648933364527146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:28:25,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=17090, skipped=0, lr=[9.28290743118563e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:28:25,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=17090/global_step=17090, RunningAvgSamplesPerSec=2.6359692819005125, CurrSamplesPerSec=2.660172445698479, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:28:40,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=17100, skipped=0, lr=[9.282037779684587e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:28:40,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=17100/global_step=17100, RunningAvgSamplesPerSec=2.6359840229125218, CurrSamplesPerSec=2.653749317371165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:28:55,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=17110, skipped=0, lr=[9.281167641956774e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:28:55,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=17110/global_step=17110, RunningAvgSamplesPerSec=2.6359934584062183, CurrSamplesPerSec=2.674323836957157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:29:10,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=17120, skipped=0, lr=[9.280297018100992e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:29:10,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=17120/global_step=17120, RunningAvgSamplesPerSec=2.636001443388248, CurrSamplesPerSec=2.6650260029578736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:29:25,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=17130, skipped=0, lr=[9.279425908216104e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:29:25,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=17130/global_step=17130, RunningAvgSamplesPerSec=2.636011774699878, CurrSamplesPerSec=2.6587644017288707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:29:40,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=17140, skipped=0, lr=[9.278554312401022e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:29:40,895] [INFO] [timer.py:259:stop] epoch=0/micro_step=17140/global_step=17140, RunningAvgSamplesPerSec=2.6360240611372854, CurrSamplesPerSec=2.668595247916149, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:29:56,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=17150, skipped=0, lr=[9.277682230754716e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:29:56,018] [INFO] [timer.py:259:stop] epoch=0/micro_step=17150/global_step=17150, RunningAvgSamplesPerSec=2.6360369495482625, CurrSamplesPerSec=2.655305435980538, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:30:11,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=17160, skipped=0, lr=[9.276809663376215e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:30:11,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=17160/global_step=17160, RunningAvgSamplesPerSec=2.636054282232898, CurrSamplesPerSec=2.662682372851712, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:30:26,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=17170, skipped=0, lr=[9.275936610364593e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:30:26,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=17170/global_step=17170, RunningAvgSamplesPerSec=2.6360699582860474, CurrSamplesPerSec=2.632437751422555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:30:41,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=17180, skipped=0, lr=[9.275063071818991e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:30:41,315] [INFO] [timer.py:259:stop] epoch=0/micro_step=17180/global_step=17180, RunningAvgSamplesPerSec=2.6360794902711118, CurrSamplesPerSec=2.651618235221132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:30:56,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=17190, skipped=0, lr=[9.274189047838598e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:30:56,440] [INFO] [timer.py:259:stop] epoch=0/micro_step=17190/global_step=17190, RunningAvgSamplesPerSec=2.6360910286431234, CurrSamplesPerSec=2.6644829762863425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:31:11,541] [INFO] [logging.py:96:log_dist] [Rank 0] step=17200, skipped=0, lr=[9.273314538522658e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:31:11,551] [INFO] [timer.py:259:stop] epoch=0/micro_step=17200/global_step=17200, RunningAvgSamplesPerSec=2.636102944025475, CurrSamplesPerSec=2.63719510035068, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:31:26,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=17210, skipped=0, lr=[9.272439543970474e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:31:26,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=17210/global_step=17210, RunningAvgSamplesPerSec=2.6361108846927475, CurrSamplesPerSec=2.6513789592900685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:31:41,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=17220, skipped=0, lr=[9.271564064281399e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:31:41,893] [INFO] [timer.py:259:stop] epoch=0/micro_step=17220/global_step=17220, RunningAvgSamplesPerSec=2.6361179656045897, CurrSamplesPerSec=2.659067384681067, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:31:57,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=17230, skipped=0, lr=[9.27068809955485e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:31:57,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=17230/global_step=17230, RunningAvgSamplesPerSec=2.636129124124684, CurrSamplesPerSec=2.655399995892019, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:32:12,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=17240, skipped=0, lr=[9.269811649890285e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:32:12,149] [INFO] [timer.py:259:stop] epoch=0/micro_step=17240/global_step=17240, RunningAvgSamplesPerSec=2.63614043604724, CurrSamplesPerSec=2.671905109144759, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:32:27,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=17250, skipped=0, lr=[9.268934715387232e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:32:27,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=17250/global_step=17250, RunningAvgSamplesPerSec=2.6361457369349584, CurrSamplesPerSec=2.6490751543680444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:32:42,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=17260, skipped=0, lr=[9.268057296145262e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:32:42,485] [INFO] [timer.py:259:stop] epoch=0/micro_step=17260/global_step=17260, RunningAvgSamplesPerSec=2.636156229642195, CurrSamplesPerSec=2.6532700393025928, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:32:57,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=17270, skipped=0, lr=[9.267179392264011e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:32:57,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=17270/global_step=17270, RunningAvgSamplesPerSec=2.636167468834908, CurrSamplesPerSec=2.656850337611132, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:33:12,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=17280, skipped=0, lr=[9.266301003843164e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:33:12,703] [INFO] [timer.py:259:stop] epoch=0/micro_step=17280/global_step=17280, RunningAvgSamplesPerSec=2.636181411632034, CurrSamplesPerSec=2.662337584369827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:33:27,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=17290, skipped=0, lr=[9.265422130982457e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:33:27,876] [INFO] [timer.py:259:stop] epoch=0/micro_step=17290/global_step=17290, RunningAvgSamplesPerSec=2.6361870613956473, CurrSamplesPerSec=2.6653431177478355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:33:43,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=17300, skipped=0, lr=[9.264542773781695e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:33:43,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=17300/global_step=17300, RunningAvgSamplesPerSec=2.636197392651674, CurrSamplesPerSec=2.6457222760466066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:33:58,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=17310, skipped=0, lr=[9.263662932340722e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:33:58,138] [INFO] [timer.py:259:stop] epoch=0/micro_step=17310/global_step=17310, RunningAvgSamplesPerSec=2.636207583483723, CurrSamplesPerSec=2.612187471040704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:34:13,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=17320, skipped=0, lr=[9.262782606759447e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:34:13,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=17320/global_step=17320, RunningAvgSamplesPerSec=2.6362213854898022, CurrSamplesPerSec=2.6773432644116593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:34:28,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=17330, skipped=0, lr=[9.261901797137832e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:34:28,378] [INFO] [timer.py:259:stop] epoch=0/micro_step=17330/global_step=17330, RunningAvgSamplesPerSec=2.6362318395578743, CurrSamplesPerSec=2.607393403687608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:34:43,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=17340, skipped=0, lr=[9.261020503575893e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:34:43,440] [INFO] [timer.py:259:stop] epoch=0/micro_step=17340/global_step=17340, RunningAvgSamplesPerSec=2.6362486674120884, CurrSamplesPerSec=2.6787398380211296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:34:58,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=17350, skipped=0, lr=[9.260138726173699e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:34:58,527] [INFO] [timer.py:259:stop] epoch=0/micro_step=17350/global_step=17350, RunningAvgSamplesPerSec=2.63626289139098, CurrSamplesPerSec=2.6617179501778803, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:35:13,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=17360, skipped=0, lr=[9.25925646503138e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:35:13,626] [INFO] [timer.py:259:stop] epoch=0/micro_step=17360/global_step=17360, RunningAvgSamplesPerSec=2.636276961981645, CurrSamplesPerSec=2.680792668287009, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:35:28,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=17370, skipped=0, lr=[9.258373720249114e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:35:28,764] [INFO] [timer.py:259:stop] epoch=0/micro_step=17370/global_step=17370, RunningAvgSamplesPerSec=2.6362865880055733, CurrSamplesPerSec=2.63398633776762, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:35:43,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=17380, skipped=0, lr=[9.257490491927139e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:35:43,898] [INFO] [timer.py:259:stop] epoch=0/micro_step=17380/global_step=17380, RunningAvgSamplesPerSec=2.6362974085064437, CurrSamplesPerSec=2.6488856863980224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:35:59,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=17390, skipped=0, lr=[9.256606780165742e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:35:59,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=17390/global_step=17390, RunningAvgSamplesPerSec=2.6363028150632393, CurrSamplesPerSec=2.634506247721889, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:36:14,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=17400, skipped=0, lr=[9.255722585065274e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:36:14,199] [INFO] [timer.py:259:stop] epoch=0/micro_step=17400/global_step=17400, RunningAvgSamplesPerSec=2.636313540522371, CurrSamplesPerSec=2.646477665933344, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:36:29,339] [INFO] [logging.py:96:log_dist] [Rank 0] step=17410, skipped=0, lr=[9.254837906726132e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:36:29,340] [INFO] [timer.py:259:stop] epoch=0/micro_step=17410/global_step=17410, RunningAvgSamplesPerSec=2.6363225895974742, CurrSamplesPerSec=2.652873989160938, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:36:44,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=17420, skipped=0, lr=[9.253952745248773e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:36:44,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=17420/global_step=17420, RunningAvgSamplesPerSec=2.636333638912197, CurrSamplesPerSec=2.6729791330637176, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:36:59,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=17430, skipped=0, lr=[9.253067100733706e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:36:59,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=17430/global_step=17430, RunningAvgSamplesPerSec=2.6363420081726634, CurrSamplesPerSec=2.6552356760962836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:37:14,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=17440, skipped=0, lr=[9.252180973281498e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:37:14,767] [INFO] [timer.py:259:stop] epoch=0/micro_step=17440/global_step=17440, RunningAvgSamplesPerSec=2.636349666085041, CurrSamplesPerSec=2.658818756591634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:37:29,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=17450, skipped=0, lr=[9.251294362992769e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:37:29,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=17450/global_step=17450, RunningAvgSamplesPerSec=2.636360280675966, CurrSamplesPerSec=2.6433933011123174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:37:45,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=17460, skipped=0, lr=[9.250407269968192e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:37:45,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=17460/global_step=17460, RunningAvgSamplesPerSec=2.636371025429992, CurrSamplesPerSec=2.634167062826174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:38:00,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=17470, skipped=0, lr=[9.249519694308498e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:38:00,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=17470/global_step=17470, RunningAvgSamplesPerSec=2.636370612111388, CurrSamplesPerSec=2.6506814922435713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:38:15,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=17480, skipped=0, lr=[9.24863163611447e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:38:15,426] [INFO] [timer.py:259:stop] epoch=0/micro_step=17480/global_step=17480, RunningAvgSamplesPerSec=2.6363765413901903, CurrSamplesPerSec=2.646228047421606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:38:30,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=17490, skipped=0, lr=[9.247743095486951e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:38:30,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=17490/global_step=17490, RunningAvgSamplesPerSec=2.6363770819918155, CurrSamplesPerSec=2.6474189591385446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:38:45,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=17500, skipped=0, lr=[9.246854072526832e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:38:45,783] [INFO] [timer.py:259:stop] epoch=0/micro_step=17500/global_step=17500, RunningAvgSamplesPerSec=2.6363906979216134, CurrSamplesPerSec=2.658776620845458, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:39:00,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=17510, skipped=0, lr=[9.245964567335062e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:39:01,002] [INFO] [timer.py:259:stop] epoch=0/micro_step=17510/global_step=17510, RunningAvgSamplesPerSec=2.6363927025658587, CurrSamplesPerSec=2.6368709712706915, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:39:16,136] [INFO] [logging.py:96:log_dist] [Rank 0] step=17520, skipped=0, lr=[9.245074580012646e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:39:16,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=17520/global_step=17520, RunningAvgSamplesPerSec=2.636400372041617, CurrSamplesPerSec=2.657033793122035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:39:31,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=17530, skipped=0, lr=[9.24418411066064e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:39:31,229] [INFO] [timer.py:259:stop] epoch=0/micro_step=17530/global_step=17530, RunningAvgSamplesPerSec=2.6364166649938743, CurrSamplesPerSec=2.681774395117227, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:39:46,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=17540, skipped=0, lr=[9.24329315938016e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:39:46,310] [INFO] [timer.py:259:stop] epoch=0/micro_step=17540/global_step=17540, RunningAvgSamplesPerSec=2.6364309338412606, CurrSamplesPerSec=2.6601998625460856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:40:01,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=17550, skipped=0, lr=[9.242401726272372e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:40:01,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=17550/global_step=17550, RunningAvgSamplesPerSec=2.6364411474199163, CurrSamplesPerSec=2.6516777466241854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:40:16,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=17560, skipped=0, lr=[9.241509811438498e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:40:16,574] [INFO] [timer.py:259:stop] epoch=0/micro_step=17560/global_step=17560, RunningAvgSamplesPerSec=2.6364514374578083, CurrSamplesPerSec=2.658099682531026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:40:31,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=17570, skipped=0, lr=[9.240617414979817e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:40:31,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=17570/global_step=17570, RunningAvgSamplesPerSec=2.636465402068663, CurrSamplesPerSec=2.6595926063150475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:40:46,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=17580, skipped=0, lr=[9.239724536997662e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:40:46,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=17580/global_step=17580, RunningAvgSamplesPerSec=2.636477608740258, CurrSamplesPerSec=2.652174477653364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:41:01,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=17590, skipped=0, lr=[9.238831177593414e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:41:01,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=17590/global_step=17590, RunningAvgSamplesPerSec=2.63648931484334, CurrSamplesPerSec=2.649703143744207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:41:17,035] [INFO] [logging.py:96:log_dist] [Rank 0] step=17600, skipped=0, lr=[9.237937336868518e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:41:17,036] [INFO] [timer.py:259:stop] epoch=0/micro_step=17600/global_step=17600, RunningAvgSamplesPerSec=2.636497326452262, CurrSamplesPerSec=2.6742922916550005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:41:32,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=17610, skipped=0, lr=[9.237043014924472e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:41:32,195] [INFO] [timer.py:259:stop] epoch=0/micro_step=17610/global_step=17610, RunningAvgSamplesPerSec=2.636505295902701, CurrSamplesPerSec=2.658112316678739, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:41:47,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=17620, skipped=0, lr=[9.236148211862823e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:41:47,398] [INFO] [timer.py:259:stop] epoch=0/micro_step=17620/global_step=17620, RunningAvgSamplesPerSec=2.636508650095185, CurrSamplesPerSec=2.653752255686596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:42:02,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=17630, skipped=0, lr=[9.235252927785179e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:42:02,567] [INFO] [timer.py:259:stop] epoch=0/micro_step=17630/global_step=17630, RunningAvgSamplesPerSec=2.6365152451829177, CurrSamplesPerSec=2.654647065462224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:42:17,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=17640, skipped=0, lr=[9.234357162793197e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:42:17,950] [INFO] [timer.py:259:stop] epoch=0/micro_step=17640/global_step=17640, RunningAvgSamplesPerSec=2.6365016342866814, CurrSamplesPerSec=2.546090367410289, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:42:33,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=17650, skipped=0, lr=[9.233460916988593e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:42:33,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=17650/global_step=17650, RunningAvgSamplesPerSec=2.6365042409521955, CurrSamplesPerSec=2.6418107574320113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:42:48,524] [INFO] [logging.py:96:log_dist] [Rank 0] step=17660, skipped=0, lr=[9.232564190473134e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:42:48,527] [INFO] [timer.py:259:stop] epoch=0/micro_step=17660/global_step=17660, RunningAvgSamplesPerSec=2.636499537298443, CurrSamplesPerSec=2.5708456789866676, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:43:03,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=17670, skipped=0, lr=[9.231666983348648e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:43:03,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=17670/global_step=17670, RunningAvgSamplesPerSec=2.636505232577975, CurrSamplesPerSec=2.6132429126196266, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:43:18,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=17680, skipped=0, lr=[9.23076929571701e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:43:18,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=17680/global_step=17680, RunningAvgSamplesPerSec=2.6365107280347804, CurrSamplesPerSec=2.5838799959620995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:43:34,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=17690, skipped=0, lr=[9.229871127680153e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:43:34,112] [INFO] [timer.py:259:stop] epoch=0/micro_step=17690/global_step=17690, RunningAvgSamplesPerSec=2.6365170786741197, CurrSamplesPerSec=2.635521014606434, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:43:49,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=17700, skipped=0, lr=[9.22897247934006e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:43:49,269] [INFO] [timer.py:259:stop] epoch=0/micro_step=17700/global_step=17700, RunningAvgSamplesPerSec=2.6365248309224607, CurrSamplesPerSec=2.6388879169291086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:44:04,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=17710, skipped=0, lr=[9.228073350798783e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:44:04,384] [INFO] [timer.py:259:stop] epoch=0/micro_step=17710/global_step=17710, RunningAvgSamplesPerSec=2.6365358369742378, CurrSamplesPerSec=2.6638619202044076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:44:19,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=17720, skipped=0, lr=[9.227173742158411e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:44:19,516] [INFO] [timer.py:259:stop] epoch=0/micro_step=17720/global_step=17720, RunningAvgSamplesPerSec=2.636546690618342, CurrSamplesPerSec=2.6305387069110466, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:44:34,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=17730, skipped=0, lr=[9.226273653521096e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:44:34,654] [INFO] [timer.py:259:stop] epoch=0/micro_step=17730/global_step=17730, RunningAvgSamplesPerSec=2.6365556646197783, CurrSamplesPerSec=2.6476733986092307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:44:49,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=17740, skipped=0, lr=[9.225373084989044e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:44:49,793] [INFO] [timer.py:259:stop] epoch=0/micro_step=17740/global_step=17740, RunningAvgSamplesPerSec=2.636564433283461, CurrSamplesPerSec=2.677171945700684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:45:04,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=17750, skipped=0, lr=[9.224472036664514e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:45:04,896] [INFO] [timer.py:259:stop] epoch=0/micro_step=17750/global_step=17750, RunningAvgSamplesPerSec=2.6365771813816106, CurrSamplesPerSec=2.6565592171096744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:45:20,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=17760, skipped=0, lr=[9.223570508649821e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:45:20,128] [INFO] [timer.py:259:stop] epoch=0/micro_step=17760/global_step=17760, RunningAvgSamplesPerSec=2.636578777384881, CurrSamplesPerSec=2.6611488336552154, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:45:35,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=17770, skipped=0, lr=[9.222668501047334e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:45:35,297] [INFO] [timer.py:259:stop] epoch=0/micro_step=17770/global_step=17770, RunningAvgSamplesPerSec=2.6365860910038963, CurrSamplesPerSec=2.646789964085417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:45:50,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=17780, skipped=0, lr=[9.221766013959476e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:45:50,398] [INFO] [timer.py:259:stop] epoch=0/micro_step=17780/global_step=17780, RunningAvgSamplesPerSec=2.636598647688513, CurrSamplesPerSec=2.6562029758035064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:46:05,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=17790, skipped=0, lr=[9.220863047488725e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:46:05,570] [INFO] [timer.py:259:stop] epoch=0/micro_step=17790/global_step=17790, RunningAvgSamplesPerSec=2.636605043762148, CurrSamplesPerSec=2.642662981002749, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:46:20,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=17800, skipped=0, lr=[9.219959601737613e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:46:20,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=17800/global_step=17800, RunningAvgSamplesPerSec=2.636610306788607, CurrSamplesPerSec=2.6687977347789533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:46:35,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=17810, skipped=0, lr=[9.219055676808725e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:46:35,861] [INFO] [timer.py:259:stop] epoch=0/micro_step=17810/global_step=17810, RunningAvgSamplesPerSec=2.636622264304501, CurrSamplesPerSec=2.665341847444607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:46:50,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=17820, skipped=0, lr=[9.218151272804705e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:46:50,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=17820/global_step=17820, RunningAvgSamplesPerSec=2.6366325770361487, CurrSamplesPerSec=2.663464393267241, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:47:06,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=17830, skipped=0, lr=[9.217246389828245e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:47:06,158] [INFO] [timer.py:259:stop] epoch=0/micro_step=17830/global_step=17830, RunningAvgSamplesPerSec=2.6366390771372354, CurrSamplesPerSec=2.671008409502724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:47:21,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=17840, skipped=0, lr=[9.216341027982098e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:47:21,316] [INFO] [timer.py:259:stop] epoch=0/micro_step=17840/global_step=17840, RunningAvgSamplesPerSec=2.636646997892607, CurrSamplesPerSec=2.66061371468446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:47:36,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=17850, skipped=0, lr=[9.215435187369066e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:47:36,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=17850/global_step=17850, RunningAvgSamplesPerSec=2.6366518531593726, CurrSamplesPerSec=2.6464893549107753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:47:51,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=17860, skipped=0, lr=[9.214528868092009e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:47:51,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=17860/global_step=17860, RunningAvgSamplesPerSec=2.6366573399439925, CurrSamplesPerSec=2.6547940886314954, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:48:06,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=17870, skipped=0, lr=[9.213622070253839e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:48:06,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=17870/global_step=17870, RunningAvgSamplesPerSec=2.636655735043542, CurrSamplesPerSec=2.6564612096788127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:48:22,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=17880, skipped=0, lr=[9.21271479395752e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:48:22,152] [INFO] [timer.py:259:stop] epoch=0/micro_step=17880/global_step=17880, RunningAvgSamplesPerSec=2.6366596817002264, CurrSamplesPerSec=2.6285678569399913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:48:37,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=17890, skipped=0, lr=[9.21180703930608e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:48:37,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=17890/global_step=17890, RunningAvgSamplesPerSec=2.636667033210621, CurrSamplesPerSec=2.6441511120328784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:48:52,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=17900, skipped=0, lr=[9.21089880640259e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:48:52,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=17900/global_step=17900, RunningAvgSamplesPerSec=2.636667588281051, CurrSamplesPerSec=2.667323721805752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:49:07,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=17910, skipped=0, lr=[9.209990095350181e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:49:07,697] [INFO] [timer.py:259:stop] epoch=0/micro_step=17910/global_step=17910, RunningAvgSamplesPerSec=2.6366766213392645, CurrSamplesPerSec=2.6601437640627594, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:49:22,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=17920, skipped=0, lr=[9.20908090625204e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:49:22,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=17920/global_step=17920, RunningAvgSamplesPerSec=2.636682029615867, CurrSamplesPerSec=2.6388115461454, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:49:38,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=17930, skipped=0, lr=[9.208171239211403e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:49:38,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=17930/global_step=17930, RunningAvgSamplesPerSec=2.636686607053482, CurrSamplesPerSec=2.6434953448889558, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:49:53,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=17940, skipped=0, lr=[9.207261094331563e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:49:53,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=17940/global_step=17940, RunningAvgSamplesPerSec=2.6366959135018937, CurrSamplesPerSec=2.6430992924889236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:50:08,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=17950, skipped=0, lr=[9.206350471715867e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:50:08,432] [INFO] [timer.py:259:stop] epoch=0/micro_step=17950/global_step=17950, RunningAvgSamplesPerSec=2.636699168163844, CurrSamplesPerSec=2.617724796412467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:50:23,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=17960, skipped=0, lr=[9.205439371467722e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:50:23,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=17960/global_step=17960, RunningAvgSamplesPerSec=2.636709765685399, CurrSamplesPerSec=2.66062173144044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:50:38,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=17970, skipped=0, lr=[9.204527793690573e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:50:38,685] [INFO] [timer.py:259:stop] epoch=0/micro_step=17970/global_step=17970, RunningAvgSamplesPerSec=2.6367227222424825, CurrSamplesPerSec=2.6492851483290547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:50:53,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=17980, skipped=0, lr=[9.20361573848794e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:50:53,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=17980/global_step=17980, RunningAvgSamplesPerSec=2.6367314535055693, CurrSamplesPerSec=2.668728113188036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:51:09,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=17990, skipped=0, lr=[9.202703205963384e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:51:09,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=17990/global_step=17990, RunningAvgSamplesPerSec=2.6367367773720445, CurrSamplesPerSec=2.6174229941836518, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:51:24,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=18000, skipped=0, lr=[9.201790196220522e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:51:24,196] [INFO] [timer.py:259:stop] epoch=0/micro_step=18000/global_step=18000, RunningAvgSamplesPerSec=2.636743521761648, CurrSamplesPerSec=2.655336114680427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:51:39,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=18010, skipped=0, lr=[9.200876709363027e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:51:39,349] [INFO] [timer.py:259:stop] epoch=0/micro_step=18010/global_step=18010, RunningAvgSamplesPerSec=2.6367511602560736, CurrSamplesPerSec=2.6522818127688943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:51:54,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=18020, skipped=0, lr=[9.199962745494625e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:51:54,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=18020/global_step=18020, RunningAvgSamplesPerSec=2.636757103756488, CurrSamplesPerSec=2.660596415533528, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:52:09,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=18030, skipped=0, lr=[9.199048304719097e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:52:09,700] [INFO] [timer.py:259:stop] epoch=0/micro_step=18030/global_step=18030, RunningAvgSamplesPerSec=2.6367650124131785, CurrSamplesPerSec=2.6215931197563687, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:52:24,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=18040, skipped=0, lr=[9.198133387140282e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:52:24,852] [INFO] [timer.py:259:stop] epoch=0/micro_step=18040/global_step=18040, RunningAvgSamplesPerSec=2.636773483367981, CurrSamplesPerSec=2.6644436228611545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:52:39,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=18050, skipped=0, lr=[9.197217992862063e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:52:39,898] [INFO] [timer.py:259:stop] epoch=0/micro_step=18050/global_step=18050, RunningAvgSamplesPerSec=2.636791726699064, CurrSamplesPerSec=2.668365205792613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:52:54,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=18060, skipped=0, lr=[9.19630212198839e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:52:54,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=18060/global_step=18060, RunningAvgSamplesPerSec=2.636806283391783, CurrSamplesPerSec=2.6574310863728474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:53:10,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=18070, skipped=0, lr=[9.195385774623254e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:53:10,173] [INFO] [timer.py:259:stop] epoch=0/micro_step=18070/global_step=18070, RunningAvgSamplesPerSec=2.6368107378176884, CurrSamplesPerSec=2.64170052452395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:53:25,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=18080, skipped=0, lr=[9.19446895087071e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:53:25,241] [INFO] [timer.py:259:stop] epoch=0/micro_step=18080/global_step=18080, RunningAvgSamplesPerSec=2.6368256549666063, CurrSamplesPerSec=2.6715026258344623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:53:40,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=18090, skipped=0, lr=[9.193551650834864e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:53:40,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=18090/global_step=18090, RunningAvgSamplesPerSec=2.6368419977386157, CurrSamplesPerSec=2.6706992982874196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:53:55,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=18100, skipped=0, lr=[9.192633874619875e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:53:55,382] [INFO] [timer.py:259:stop] epoch=0/micro_step=18100/global_step=18100, RunningAvgSamplesPerSec=2.6368557456769857, CurrSamplesPerSec=2.675995523111306, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:54:10,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=18110, skipped=0, lr=[9.191715622329953e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:54:10,564] [INFO] [timer.py:259:stop] epoch=0/micro_step=18110/global_step=18110, RunningAvgSamplesPerSec=2.6368616018208146, CurrSamplesPerSec=2.6444661959171114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:54:25,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=18120, skipped=0, lr=[9.190796894069372e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:54:25,759] [INFO] [timer.py:259:stop] epoch=0/micro_step=18120/global_step=18120, RunningAvgSamplesPerSec=2.6368659324724244, CurrSamplesPerSec=2.6333025365264655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:54:40,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=18130, skipped=0, lr=[9.189877689942453e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:54:40,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=18130/global_step=18130, RunningAvgSamplesPerSec=2.6368713930764276, CurrSamplesPerSec=2.65788449932582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:54:56,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=18140, skipped=0, lr=[9.188958010053568e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:54:56,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=18140/global_step=18140, RunningAvgSamplesPerSec=2.636875361915489, CurrSamplesPerSec=2.633159123883021, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:55:11,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=18150, skipped=0, lr=[9.188037854507148e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:55:11,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=18150/global_step=18150, RunningAvgSamplesPerSec=2.6368856944911068, CurrSamplesPerSec=2.659528523222579, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:55:26,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=18160, skipped=0, lr=[9.18711722340768e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:55:26,430] [INFO] [timer.py:259:stop] epoch=0/micro_step=18160/global_step=18160, RunningAvgSamplesPerSec=2.6368938451288684, CurrSamplesPerSec=2.6430813875611063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:55:41,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=18170, skipped=0, lr=[9.186196116859699e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:55:41,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=18170/global_step=18170, RunningAvgSamplesPerSec=2.6369005850555216, CurrSamplesPerSec=2.654448820002927, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:55:56,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=18180, skipped=0, lr=[9.185274534967799e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:55:56,748] [INFO] [timer.py:259:stop] epoch=0/micro_step=18180/global_step=18180, RunningAvgSamplesPerSec=2.6369099403346032, CurrSamplesPerSec=2.6591748569761573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:56:11,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=18190, skipped=0, lr=[9.184352477836624e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:56:11,913] [INFO] [timer.py:259:stop] epoch=0/micro_step=18190/global_step=18190, RunningAvgSamplesPerSec=2.6369166473340977, CurrSamplesPerSec=2.6466208630222074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:56:26,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=18200, skipped=0, lr=[9.183429945570874e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:56:26,998] [INFO] [timer.py:259:stop] epoch=0/micro_step=18200/global_step=18200, RunningAvgSamplesPerSec=2.6369315506510795, CurrSamplesPerSec=2.682255879612104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:56:42,145] [INFO] [logging.py:96:log_dist] [Rank 0] step=18210, skipped=0, lr=[9.182506938275306e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:56:42,157] [INFO] [timer.py:259:stop] epoch=0/micro_step=18210/global_step=18210, RunningAvgSamplesPerSec=2.6369387440999774, CurrSamplesPerSec=2.66652460036953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:56:57,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=18220, skipped=0, lr=[9.181583456054723e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:56:57,319] [INFO] [timer.py:259:stop] epoch=0/micro_step=18220/global_step=18220, RunningAvgSamplesPerSec=2.6369471742316204, CurrSamplesPerSec=2.6492412225359048, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:57:12,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=18230, skipped=0, lr=[9.18065949901399e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:57:12,502] [INFO] [timer.py:259:stop] epoch=0/micro_step=18230/global_step=18230, RunningAvgSamplesPerSec=2.636954591570776, CurrSamplesPerSec=2.657125530378517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:57:27,691] [INFO] [logging.py:96:log_dist] [Rank 0] step=18240, skipped=0, lr=[9.179735067258021e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:57:27,693] [INFO] [timer.py:259:stop] epoch=0/micro_step=18240/global_step=18240, RunningAvgSamplesPerSec=2.6369601987438505, CurrSamplesPerSec=2.6514736587856356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:57:42,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=18250, skipped=0, lr=[9.178810160891789e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:57:42,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=18250/global_step=18250, RunningAvgSamplesPerSec=2.6369683187789694, CurrSamplesPerSec=2.6364781441207623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:57:58,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=18260, skipped=0, lr=[9.177884780020314e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:57:58,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=18260/global_step=18260, RunningAvgSamplesPerSec=2.636978961418397, CurrSamplesPerSec=2.666270338561766, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:58:13,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=18270, skipped=0, lr=[9.176958924748673e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:58:13,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=18270/global_step=18270, RunningAvgSamplesPerSec=2.636989134342959, CurrSamplesPerSec=2.6746896462920393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:58:28,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=18280, skipped=0, lr=[9.176032595181997e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:58:28,262] [INFO] [timer.py:259:stop] epoch=0/micro_step=18280/global_step=18280, RunningAvgSamplesPerSec=2.6370004974900563, CurrSamplesPerSec=2.6445608190973484, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:58:43,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=18290, skipped=0, lr=[9.175105791425473e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:58:43,368] [INFO] [timer.py:259:stop] epoch=0/micro_step=18290/global_step=18290, RunningAvgSamplesPerSec=2.6370124971580795, CurrSamplesPerSec=2.6430688958946615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:58:58,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=18300, skipped=0, lr=[9.17417851358434e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:58:58,498] [INFO] [timer.py:259:stop] epoch=0/micro_step=18300/global_step=18300, RunningAvgSamplesPerSec=2.6370216446774166, CurrSamplesPerSec=2.583779717314269, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:59:13,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=18310, skipped=0, lr=[9.17325076176389e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:59:13,640] [INFO] [timer.py:259:stop] epoch=0/micro_step=18310/global_step=18310, RunningAvgSamplesPerSec=2.6370298828623513, CurrSamplesPerSec=2.6468734787538786, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:59:28,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=18320, skipped=0, lr=[9.172322536069467e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:59:28,795] [INFO] [timer.py:259:stop] epoch=0/micro_step=18320/global_step=18320, RunningAvgSamplesPerSec=2.6370386985840333, CurrSamplesPerSec=2.6189412868174884, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:59:43,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=18330, skipped=0, lr=[9.171393836606474e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:59:43,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=18330/global_step=18330, RunningAvgSamplesPerSec=2.6370557960327323, CurrSamplesPerSec=2.677180489755312, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 07:59:58,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=18340, skipped=0, lr=[9.170464663480365e-06], mom=[(0.9, 0.95)] +[2024-11-01 07:59:59,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=18340/global_step=18340, RunningAvgSamplesPerSec=2.6370626854693886, CurrSamplesPerSec=2.635297880958553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:00:14,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=18350, skipped=0, lr=[9.16953501679665e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:00:14,116] [INFO] [timer.py:259:stop] epoch=0/micro_step=18350/global_step=18350, RunningAvgSamplesPerSec=2.637073770101645, CurrSamplesPerSec=2.6742436962944507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:00:29,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=18360, skipped=0, lr=[9.168604896660884e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:00:29,264] [INFO] [timer.py:259:stop] epoch=0/micro_step=18360/global_step=18360, RunningAvgSamplesPerSec=2.6370817946017335, CurrSamplesPerSec=2.636740430276625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:00:44,381] [INFO] [logging.py:96:log_dist] [Rank 0] step=18370, skipped=0, lr=[9.16767430317869e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:00:44,383] [INFO] [timer.py:259:stop] epoch=0/micro_step=18370/global_step=18370, RunningAvgSamplesPerSec=2.6370926579502068, CurrSamplesPerSec=2.665964441126744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:00:59,524] [INFO] [logging.py:96:log_dist] [Rank 0] step=18380, skipped=0, lr=[9.166743236455733e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:00:59,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=18380/global_step=18380, RunningAvgSamplesPerSec=2.63710100629467, CurrSamplesPerSec=2.647528416430757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:01:14,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=18390, skipped=0, lr=[9.165811696597737e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:01:14,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=18390/global_step=18390, RunningAvgSamplesPerSec=2.63711151329931, CurrSamplesPerSec=2.6726342274863257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:01:29,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=18400, skipped=0, lr=[9.164879683710479e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:01:29,746] [INFO] [timer.py:259:stop] epoch=0/micro_step=18400/global_step=18400, RunningAvgSamplesPerSec=2.6371244252463564, CurrSamplesPerSec=2.6714949687652068, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:01:44,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=18410, skipped=0, lr=[9.163947197899787e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:01:44,854] [INFO] [timer.py:259:stop] epoch=0/micro_step=18410/global_step=18410, RunningAvgSamplesPerSec=2.6371353981428935, CurrSamplesPerSec=2.6639524376008294, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:01:59,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=18420, skipped=0, lr=[9.163014239271548e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:01:59,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=18420/global_step=18420, RunningAvgSamplesPerSec=2.6371501069418293, CurrSamplesPerSec=2.6482865103014355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:02:15,052] [INFO] [logging.py:96:log_dist] [Rank 0] step=18430, skipped=0, lr=[9.1620808079317e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:02:15,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=18430/global_step=18430, RunningAvgSamplesPerSec=2.637159102207708, CurrSamplesPerSec=2.6568036362400695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:02:30,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=18440, skipped=0, lr=[9.161146903986232e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:02:30,239] [INFO] [timer.py:259:stop] epoch=0/micro_step=18440/global_step=18440, RunningAvgSamplesPerSec=2.6371640067632063, CurrSamplesPerSec=2.630565516322499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:02:45,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=18450, skipped=0, lr=[9.160212527541188e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:02:45,352] [INFO] [timer.py:259:stop] epoch=0/micro_step=18450/global_step=18450, RunningAvgSamplesPerSec=2.6371745092426884, CurrSamplesPerSec=2.6495893221159608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:03:00,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=18460, skipped=0, lr=[9.159277678702672e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:03:00,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=18460/global_step=18460, RunningAvgSamplesPerSec=2.637178992151131, CurrSamplesPerSec=2.6452584050190224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:03:15,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=18470, skipped=0, lr=[9.158342357576832e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:03:15,628] [INFO] [timer.py:259:stop] epoch=0/micro_step=18470/global_step=18470, RunningAvgSamplesPerSec=2.6371916676965363, CurrSamplesPerSec=2.6519476769423185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:03:30,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=18480, skipped=0, lr=[9.157406564269874e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:03:30,775] [INFO] [timer.py:259:stop] epoch=0/micro_step=18480/global_step=18480, RunningAvgSamplesPerSec=2.6371993565727387, CurrSamplesPerSec=2.6542925965632014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:03:45,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=18490, skipped=0, lr=[9.156470298888061e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:03:45,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=18490/global_step=18490, RunningAvgSamplesPerSec=2.6372108801260192, CurrSamplesPerSec=2.65786597245975, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:04:00,999] [INFO] [logging.py:96:log_dist] [Rank 0] step=18500, skipped=0, lr=[9.155533561537703e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:04:01,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=18500/global_step=18500, RunningAvgSamplesPerSec=2.6372201387227556, CurrSamplesPerSec=2.668255716278734, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:04:16,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=18510, skipped=0, lr=[9.15459635232517e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:04:16,205] [INFO] [timer.py:259:stop] epoch=0/micro_step=18510/global_step=18510, RunningAvgSamplesPerSec=2.6372247323342815, CurrSamplesPerSec=2.6552049997171925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:04:31,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=18520, skipped=0, lr=[9.15365867135688e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:04:31,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=18520/global_step=18520, RunningAvgSamplesPerSec=2.6372321363909164, CurrSamplesPerSec=2.6605538015115906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:04:46,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=18530, skipped=0, lr=[9.152720518739307e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:04:46,528] [INFO] [timer.py:259:stop] epoch=0/micro_step=18530/global_step=18530, RunningAvgSamplesPerSec=2.6372393298745362, CurrSamplesPerSec=2.633517064710367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:05:01,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=18540, skipped=0, lr=[9.151781894578977e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:05:01,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=18540/global_step=18540, RunningAvgSamplesPerSec=2.637245551951027, CurrSamplesPerSec=2.636213424760256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:05:16,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=18550, skipped=0, lr=[9.150842798982476e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:05:16,858] [INFO] [timer.py:259:stop] epoch=0/micro_step=18550/global_step=18550, RunningAvgSamplesPerSec=2.6372532830217095, CurrSamplesPerSec=2.675608447924397, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:05:32,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=18560, skipped=0, lr=[9.149903232056435e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:05:32,083] [INFO] [timer.py:259:stop] epoch=0/micro_step=18560/global_step=18560, RunningAvgSamplesPerSec=2.6372552740790702, CurrSamplesPerSec=2.660118879026993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:05:47,451] [INFO] [logging.py:96:log_dist] [Rank 0] step=18570, skipped=0, lr=[9.148963193907541e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:05:47,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=18570/global_step=18570, RunningAvgSamplesPerSec=2.637243363124321, CurrSamplesPerSec=2.6184753154818425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:06:02,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=18580, skipped=0, lr=[9.14802268464254e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:06:02,662] [INFO] [timer.py:259:stop] epoch=0/micro_step=18580/global_step=18580, RunningAvgSamplesPerSec=2.637245719567735, CurrSamplesPerSec=2.6462305517213083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:06:17,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=18590, skipped=0, lr=[9.147081704368222e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:06:17,795] [INFO] [timer.py:259:stop] epoch=0/micro_step=18590/global_step=18590, RunningAvgSamplesPerSec=2.6372545335316904, CurrSamplesPerSec=2.643795690859036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:06:32,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=18600, skipped=0, lr=[9.146140253191438e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:06:32,938] [INFO] [timer.py:259:stop] epoch=0/micro_step=18600/global_step=18600, RunningAvgSamplesPerSec=2.637263285849299, CurrSamplesPerSec=2.675735611322675, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:06:48,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=18610, skipped=0, lr=[9.145198331219091e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:06:48,114] [INFO] [timer.py:259:stop] epoch=0/micro_step=18610/global_step=18610, RunningAvgSamplesPerSec=2.6372675749525, CurrSamplesPerSec=2.609657317980124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:07:03,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=18620, skipped=0, lr=[9.144255938558137e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:07:03,262] [INFO] [timer.py:259:stop] epoch=0/micro_step=18620/global_step=18620, RunningAvgSamplesPerSec=2.6372752574620084, CurrSamplesPerSec=2.657096493589209, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:07:18,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=18630, skipped=0, lr=[9.143313075315583e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:07:18,381] [INFO] [timer.py:259:stop] epoch=0/micro_step=18630/global_step=18630, RunningAvgSamplesPerSec=2.6372858136136377, CurrSamplesPerSec=2.618074876150994, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:07:33,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=18640, skipped=0, lr=[9.142369741598492e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:07:33,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=18640/global_step=18640, RunningAvgSamplesPerSec=2.637295332484841, CurrSamplesPerSec=2.6431280241591204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:07:48,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=18650, skipped=0, lr=[9.14142593751398e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:07:48,601] [INFO] [timer.py:259:stop] epoch=0/micro_step=18650/global_step=18650, RunningAvgSamplesPerSec=2.63730844947981, CurrSamplesPerSec=2.6068979111163353, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:08:03,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=18660, skipped=0, lr=[9.140481663169216e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:08:03,760] [INFO] [timer.py:259:stop] epoch=0/micro_step=18660/global_step=18660, RunningAvgSamplesPerSec=2.6373150734068944, CurrSamplesPerSec=2.654091464967758, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:08:18,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=18670, skipped=0, lr=[9.139536918671424e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:08:18,910] [INFO] [timer.py:259:stop] epoch=0/micro_step=18670/global_step=18670, RunningAvgSamplesPerSec=2.637323250069334, CurrSamplesPerSec=2.618538252799795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:08:34,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=18680, skipped=0, lr=[9.138591704127879e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:08:34,049] [INFO] [timer.py:259:stop] epoch=0/micro_step=18680/global_step=18680, RunningAvgSamplesPerSec=2.637333709767192, CurrSamplesPerSec=2.6531374500295217, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:08:49,243] [INFO] [logging.py:96:log_dist] [Rank 0] step=18690, skipped=0, lr=[9.137646019645909e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:08:49,245] [INFO] [timer.py:259:stop] epoch=0/micro_step=18690/global_step=18690, RunningAvgSamplesPerSec=2.637338659144757, CurrSamplesPerSec=2.6458324275984006, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:09:04,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=18700, skipped=0, lr=[9.136699865332898e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:09:04,401] [INFO] [timer.py:259:stop] epoch=0/micro_step=18700/global_step=18700, RunningAvgSamplesPerSec=2.6373455444229155, CurrSamplesPerSec=2.6469578339188224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:09:19,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=18710, skipped=0, lr=[9.135753241296284e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:09:19,555] [INFO] [timer.py:259:stop] epoch=0/micro_step=18710/global_step=18710, RunningAvgSamplesPerSec=2.6373564220634655, CurrSamplesPerSec=2.6686839647929808, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:09:34,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=18720, skipped=0, lr=[9.134806147643554e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:09:34,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=18720/global_step=18720, RunningAvgSamplesPerSec=2.6373677201949923, CurrSamplesPerSec=2.6489718428652953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:09:49,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=18730, skipped=0, lr=[9.133858584482252e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:09:49,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=18730/global_step=18730, RunningAvgSamplesPerSec=2.637376164402878, CurrSamplesPerSec=2.6744142141051137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:10:04,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=18740, skipped=0, lr=[9.132910551919973e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:10:04,990] [INFO] [timer.py:259:stop] epoch=0/micro_step=18740/global_step=18740, RunningAvgSamplesPerSec=2.637384906938382, CurrSamplesPerSec=2.637031782389308, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:10:20,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=18750, skipped=0, lr=[9.13196205006437e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:10:20,166] [INFO] [timer.py:259:stop] epoch=0/micro_step=18750/global_step=18750, RunningAvgSamplesPerSec=2.6373914716032254, CurrSamplesPerSec=2.6473099286207646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:10:35,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=18760, skipped=0, lr=[9.13101307902314e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:10:35,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=18760/global_step=18760, RunningAvgSamplesPerSec=2.637387302560973, CurrSamplesPerSec=2.6136198882691772, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:10:50,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=18770, skipped=0, lr=[9.130063638904041e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:10:50,715] [INFO] [timer.py:259:stop] epoch=0/micro_step=18770/global_step=18770, RunningAvgSamplesPerSec=2.6373897167421907, CurrSamplesPerSec=2.643561573433156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:11:05,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=18780, skipped=0, lr=[9.129113729814884e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:11:05,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=18780/global_step=18780, RunningAvgSamplesPerSec=2.6373932359299475, CurrSamplesPerSec=2.611801150179765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:11:21,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=18790, skipped=0, lr=[9.12816335186353e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:11:21,198] [INFO] [timer.py:259:stop] epoch=0/micro_step=18790/global_step=18790, RunningAvgSamplesPerSec=2.637393312716075, CurrSamplesPerSec=2.6474156170735452, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:11:36,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=18800, skipped=0, lr=[9.127212505157896e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:11:36,322] [INFO] [timer.py:259:stop] epoch=0/micro_step=18800/global_step=18800, RunningAvgSamplesPerSec=2.6374038382661227, CurrSamplesPerSec=2.650844829991537, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:11:51,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=18810, skipped=0, lr=[9.12626118980595e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:11:51,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=18810/global_step=18810, RunningAvgSamplesPerSec=2.637403708711447, CurrSamplesPerSec=2.650454528077772, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:12:06,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=18820, skipped=0, lr=[9.125309405915714e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:12:06,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=18820/global_step=18820, RunningAvgSamplesPerSec=2.637405026795301, CurrSamplesPerSec=2.6475747922880024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:12:21,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=18830, skipped=0, lr=[9.124357153595262e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:12:21,996] [INFO] [timer.py:259:stop] epoch=0/micro_step=18830/global_step=18830, RunningAvgSamplesPerSec=2.6374104377849523, CurrSamplesPerSec=2.6568267763538533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:12:37,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=18840, skipped=0, lr=[9.123404432952726e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:12:37,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=18840/global_step=18840, RunningAvgSamplesPerSec=2.637415624325468, CurrSamplesPerSec=2.6535226669339504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:12:52,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=18850, skipped=0, lr=[9.122451244096287e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:12:52,383] [INFO] [timer.py:259:stop] epoch=0/micro_step=18850/global_step=18850, RunningAvgSamplesPerSec=2.6374190977165703, CurrSamplesPerSec=2.652363577768358, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:13:07,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=18860, skipped=0, lr=[9.121497587134176e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:13:07,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=18860/global_step=18860, RunningAvgSamplesPerSec=2.6374215269955523, CurrSamplesPerSec=2.636727584083731, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:13:22,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=18870, skipped=0, lr=[9.120543462174687e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:13:22,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=18870/global_step=18870, RunningAvgSamplesPerSec=2.6374210832762985, CurrSamplesPerSec=2.6441177742159105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:13:38,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=18880, skipped=0, lr=[9.119588869326157e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:13:38,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=18880/global_step=18880, RunningAvgSamplesPerSec=2.637421096672872, CurrSamplesPerSec=2.629592066445326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:13:53,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=18890, skipped=0, lr=[9.118633808696981e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:13:53,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=18890/global_step=18890, RunningAvgSamplesPerSec=2.6374237007075254, CurrSamplesPerSec=2.6488564112043127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:14:08,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=18900, skipped=0, lr=[9.117678280395606e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:14:08,561] [INFO] [timer.py:259:stop] epoch=0/micro_step=18900/global_step=18900, RunningAvgSamplesPerSec=2.6374225702568066, CurrSamplesPerSec=2.579127323020325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:14:23,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=18910, skipped=0, lr=[9.116722284530536e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:14:23,727] [INFO] [timer.py:259:stop] epoch=0/micro_step=18910/global_step=18910, RunningAvgSamplesPerSec=2.637429079350559, CurrSamplesPerSec=2.6504101447959227, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:14:38,933] [INFO] [logging.py:96:log_dist] [Rank 0] step=18920, skipped=0, lr=[9.11576582121032e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:14:38,934] [INFO] [timer.py:259:stop] epoch=0/micro_step=18920/global_step=18920, RunningAvgSamplesPerSec=2.6374325389051756, CurrSamplesPerSec=2.597991022466744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:14:54,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=18930, skipped=0, lr=[9.114808890543568e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:14:54,131] [INFO] [timer.py:259:stop] epoch=0/micro_step=18930/global_step=18930, RunningAvgSamplesPerSec=2.637435761395867, CurrSamplesPerSec=2.635422897383318, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:15:09,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=18940, skipped=0, lr=[9.113851492638936e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:15:09,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=18940/global_step=18940, RunningAvgSamplesPerSec=2.6374362811182293, CurrSamplesPerSec=2.593413629566817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:15:24,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=18950, skipped=0, lr=[9.112893627605144e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:15:24,623] [INFO] [timer.py:259:stop] epoch=0/micro_step=18950/global_step=18950, RunningAvgSamplesPerSec=2.6374362976917873, CurrSamplesPerSec=2.6486636293052173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:15:39,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=18960, skipped=0, lr=[9.111935295550948e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:15:39,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=18960/global_step=18960, RunningAvgSamplesPerSec=2.6374393573281587, CurrSamplesPerSec=2.6527896759552942, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:15:55,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=18970, skipped=0, lr=[9.110976496585175e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:15:55,039] [INFO] [timer.py:259:stop] epoch=0/micro_step=18970/global_step=18970, RunningAvgSamplesPerSec=2.6374416713146265, CurrSamplesPerSec=2.653741761732792, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:16:10,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=18980, skipped=0, lr=[9.110017230816694e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:16:10,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=18980/global_step=18980, RunningAvgSamplesPerSec=2.6374437090772367, CurrSamplesPerSec=2.6478652005837002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:16:25,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=18990, skipped=0, lr=[9.109057498354433e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:16:25,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=18990/global_step=18990, RunningAvgSamplesPerSec=2.6374470814377395, CurrSamplesPerSec=2.6434895136037695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:16:40,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=19000, skipped=0, lr=[9.108097299307363e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:16:40,709] [INFO] [timer.py:259:stop] epoch=0/micro_step=19000/global_step=19000, RunningAvgSamplesPerSec=2.6374450901912416, CurrSamplesPerSec=2.656990030791075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:16:55,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=19010, skipped=0, lr=[9.107136633784518e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:16:55,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=19010/global_step=19010, RunningAvgSamplesPerSec=2.637444833694056, CurrSamplesPerSec=2.6360821204843434, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:17:11,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=19020, skipped=0, lr=[9.106175501894988e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:17:11,213] [INFO] [timer.py:259:stop] epoch=0/micro_step=19020/global_step=19020, RunningAvgSamplesPerSec=2.6374438764249644, CurrSamplesPerSec=2.638931084979683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:17:26,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=19030, skipped=0, lr=[9.105213903747901e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:17:26,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=19030/global_step=19030, RunningAvgSamplesPerSec=2.6374440398535017, CurrSamplesPerSec=2.6363240287315395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:17:41,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=19040, skipped=0, lr=[9.104251839452449e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:17:41,729] [INFO] [timer.py:259:stop] epoch=0/micro_step=19040/global_step=19040, RunningAvgSamplesPerSec=2.6374431917015477, CurrSamplesPerSec=2.6406485752078472, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:17:56,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=19050, skipped=0, lr=[9.103289309117879e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:17:56,937] [INFO] [timer.py:259:stop] epoch=0/micro_step=19050/global_step=19050, RunningAvgSamplesPerSec=2.6374463999259508, CurrSamplesPerSec=2.6430872170458555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:18:12,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=19060, skipped=0, lr=[9.102326312853483e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:18:12,149] [INFO] [timer.py:259:stop] epoch=0/micro_step=19060/global_step=19060, RunningAvgSamplesPerSec=2.63744975977827, CurrSamplesPerSec=2.637613021139031, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:18:27,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=19070, skipped=0, lr=[9.10136285076861e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:18:27,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=19070/global_step=19070, RunningAvgSamplesPerSec=2.637457164357249, CurrSamplesPerSec=2.6453480796281466, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:18:42,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=19080, skipped=0, lr=[9.100398922972663e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:18:42,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=19080/global_step=19080, RunningAvgSamplesPerSec=2.637453826244494, CurrSamplesPerSec=2.649892728677695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:18:57,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=19090, skipped=0, lr=[9.099434529575093e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:18:57,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=19090/global_step=19090, RunningAvgSamplesPerSec=2.637452936603626, CurrSamplesPerSec=2.644936877686867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:19:13,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=19100, skipped=0, lr=[9.098469670685409e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:19:13,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=19100/global_step=19100, RunningAvgSamplesPerSec=2.637450719629766, CurrSamplesPerSec=2.652054993700736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:19:28,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=19110, skipped=0, lr=[9.097504346413173e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:19:28,362] [INFO] [timer.py:259:stop] epoch=0/micro_step=19110/global_step=19110, RunningAvgSamplesPerSec=2.637453457968178, CurrSamplesPerSec=2.6113633224983195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:19:43,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=19120, skipped=0, lr=[9.096538556867995e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:19:43,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=19120/global_step=19120, RunningAvgSamplesPerSec=2.637460218944988, CurrSamplesPerSec=2.6431442640751097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:19:58,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=19130, skipped=0, lr=[9.095572302159542e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:19:58,758] [INFO] [timer.py:259:stop] epoch=0/micro_step=19130/global_step=19130, RunningAvgSamplesPerSec=2.637461733612514, CurrSamplesPerSec=2.6453047013836257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:20:14,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=19140, skipped=0, lr=[9.094605582397533e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:20:14,018] [INFO] [timer.py:259:stop] epoch=0/micro_step=19140/global_step=19140, RunningAvgSamplesPerSec=2.6374611832935306, CurrSamplesPerSec=2.639315093735702, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:20:29,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=19150, skipped=0, lr=[9.093638397691738e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:20:29,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=19150/global_step=19150, RunningAvgSamplesPerSec=2.6374652423810607, CurrSamplesPerSec=2.6242421188655607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:20:44,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=19160, skipped=0, lr=[9.092670748151983e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:20:44,475] [INFO] [timer.py:259:stop] epoch=0/micro_step=19160/global_step=19160, RunningAvgSamplesPerSec=2.6374651815643233, CurrSamplesPerSec=2.6400144842572257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:20:59,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=19170, skipped=0, lr=[9.091702633888141e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:20:59,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=19170/global_step=19170, RunningAvgSamplesPerSec=2.637469864715893, CurrSamplesPerSec=2.6371146823797624, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:21:14,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=19180, skipped=0, lr=[9.090734055010149e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:21:14,890] [INFO] [timer.py:259:stop] epoch=0/micro_step=19180/global_step=19180, RunningAvgSamplesPerSec=2.6374715492118845, CurrSamplesPerSec=2.638797849678852, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:21:30,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=19190, skipped=0, lr=[9.089765011627982e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:21:30,164] [INFO] [timer.py:259:stop] epoch=0/micro_step=19190/global_step=19190, RunningAvgSamplesPerSec=2.637468830161282, CurrSamplesPerSec=2.6423237734402893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:21:45,398] [INFO] [logging.py:96:log_dist] [Rank 0] step=19200, skipped=0, lr=[9.088795503851679e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:21:45,399] [INFO] [timer.py:259:stop] epoch=0/micro_step=19200/global_step=19200, RunningAvgSamplesPerSec=2.637470651878636, CurrSamplesPerSec=2.6522034069659988, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:22:00,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=19210, skipped=0, lr=[9.087825531791326e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:22:00,601] [INFO] [timer.py:259:stop] epoch=0/micro_step=19210/global_step=19210, RunningAvgSamplesPerSec=2.6374734791676873, CurrSamplesPerSec=2.6354034404054247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:22:15,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=19220, skipped=0, lr=[9.086855095557066e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:22:15,858] [INFO] [timer.py:259:stop] epoch=0/micro_step=19220/global_step=19220, RunningAvgSamplesPerSec=2.6374730887113524, CurrSamplesPerSec=2.6303922959209385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:22:31,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=19230, skipped=0, lr=[9.085884195259089e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:22:31,147] [INFO] [timer.py:259:stop] epoch=0/micro_step=19230/global_step=19230, RunningAvgSamplesPerSec=2.6374687634118934, CurrSamplesPerSec=2.632103228292968, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:22:46,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=19240, skipped=0, lr=[9.084912831007646e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:22:46,314] [INFO] [timer.py:259:stop] epoch=0/micro_step=19240/global_step=19240, RunningAvgSamplesPerSec=2.637474795844888, CurrSamplesPerSec=2.6517049886506388, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:23:01,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=19250, skipped=0, lr=[9.083941002913031e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:23:01,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=19250/global_step=19250, RunningAvgSamplesPerSec=2.6374791506873545, CurrSamplesPerSec=2.6454652914232666, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:23:16,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=19260, skipped=0, lr=[9.082968711085598e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:23:16,733] [INFO] [timer.py:259:stop] epoch=0/micro_step=19260/global_step=19260, RunningAvgSamplesPerSec=2.6374805402108388, CurrSamplesPerSec=2.6197436364599755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:23:31,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=19270, skipped=0, lr=[9.081995955635748e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:23:31,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=19270/global_step=19270, RunningAvgSamplesPerSec=2.637480196594663, CurrSamplesPerSec=2.6498199047658706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:23:47,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=19280, skipped=0, lr=[9.081022736673941e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:23:47,196] [INFO] [timer.py:259:stop] epoch=0/micro_step=19280/global_step=19280, RunningAvgSamplesPerSec=2.637482484152943, CurrSamplesPerSec=2.650703269446957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:24:02,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=19290, skipped=0, lr=[9.080049054310687e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:24:02,430] [INFO] [timer.py:259:stop] epoch=0/micro_step=19290/global_step=19290, RunningAvgSamplesPerSec=2.637484327133592, CurrSamplesPerSec=2.6548789494154343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:24:17,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=19300, skipped=0, lr=[9.079074908656544e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:24:17,619] [INFO] [timer.py:259:stop] epoch=0/micro_step=19300/global_step=19300, RunningAvgSamplesPerSec=2.6374899941592744, CurrSamplesPerSec=2.638221066999937, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:24:32,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=19310, skipped=0, lr=[9.07810029982213e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:24:32,879] [INFO] [timer.py:259:stop] epoch=0/micro_step=19310/global_step=19310, RunningAvgSamplesPerSec=2.6374900123686054, CurrSamplesPerSec=2.640501036693016, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:24:48,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=19320, skipped=0, lr=[9.077125227918111e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:24:48,046] [INFO] [timer.py:259:stop] epoch=0/micro_step=19320/global_step=19320, RunningAvgSamplesPerSec=2.637495476406455, CurrSamplesPerSec=2.6159578539019366, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:25:03,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=19330, skipped=0, lr=[9.076149693055205e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:25:03,309] [INFO] [timer.py:259:stop] epoch=0/micro_step=19330/global_step=19330, RunningAvgSamplesPerSec=2.637493609954138, CurrSamplesPerSec=2.6444061741498555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:25:18,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=19340, skipped=0, lr=[9.075173695344188e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:25:18,568] [INFO] [timer.py:259:stop] epoch=0/micro_step=19340/global_step=19340, RunningAvgSamplesPerSec=2.6374932641646276, CurrSamplesPerSec=2.6065212521181493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:25:33,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=19350, skipped=0, lr=[9.074197234895882e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:25:33,794] [INFO] [timer.py:259:stop] epoch=0/micro_step=19350/global_step=19350, RunningAvgSamplesPerSec=2.637497106011418, CurrSamplesPerSec=2.6523287746139035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:25:48,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=19360, skipped=0, lr=[9.073220311821164e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:25:48,986] [INFO] [timer.py:259:stop] epoch=0/micro_step=19360/global_step=19360, RunningAvgSamplesPerSec=2.6375007756945292, CurrSamplesPerSec=2.6341356305997485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:26:04,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=19370, skipped=0, lr=[9.072242926230967e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:26:04,189] [INFO] [timer.py:259:stop] epoch=0/micro_step=19370/global_step=19370, RunningAvgSamplesPerSec=2.6375055596033605, CurrSamplesPerSec=2.637489040738728, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:26:19,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=19380, skipped=0, lr=[9.071265078236271e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:26:19,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=19380/global_step=19380, RunningAvgSamplesPerSec=2.637502526815397, CurrSamplesPerSec=2.605756928525179, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:26:34,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=19390, skipped=0, lr=[9.07028676794811e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:26:34,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=19390/global_step=19390, RunningAvgSamplesPerSec=2.6375029920861905, CurrSamplesPerSec=2.6370446315465017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:26:49,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=19400, skipped=0, lr=[9.069307995477576e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:26:49,961] [INFO] [timer.py:259:stop] epoch=0/micro_step=19400/global_step=19400, RunningAvgSamplesPerSec=2.6375056730764976, CurrSamplesPerSec=2.6484094175584363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:27:05,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=19410, skipped=0, lr=[9.068328760935803e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:27:05,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=19410/global_step=19410, RunningAvgSamplesPerSec=2.637507693996737, CurrSamplesPerSec=2.641604026284665, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:27:20,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=19420, skipped=0, lr=[9.067349064433987e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:27:20,433] [INFO] [timer.py:259:stop] epoch=0/micro_step=19420/global_step=19420, RunningAvgSamplesPerSec=2.6375075281692437, CurrSamplesPerSec=2.6396277801498145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:27:35,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=19430, skipped=0, lr=[9.066368906083374e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:27:35,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=19430/global_step=19430, RunningAvgSamplesPerSec=2.6375135352060113, CurrSamplesPerSec=2.673620212561774, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:27:50,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=19440, skipped=0, lr=[9.065388285995257e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:27:50,809] [INFO] [timer.py:259:stop] epoch=0/micro_step=19440/global_step=19440, RunningAvgSamplesPerSec=2.6375162916260866, CurrSamplesPerSec=2.6344971464990024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:28:05,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=19450, skipped=0, lr=[9.064407204280989e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:28:05,979] [INFO] [timer.py:259:stop] epoch=0/micro_step=19450/global_step=19450, RunningAvgSamplesPerSec=2.637522716241409, CurrSamplesPerSec=2.648007711981763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:28:21,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=19460, skipped=0, lr=[9.063425661051972e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:28:21,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=19460/global_step=19460, RunningAvgSamplesPerSec=2.63752889968643, CurrSamplesPerSec=2.6529788638109038, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:28:36,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=19470, skipped=0, lr=[9.062443656419659e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:28:36,361] [INFO] [timer.py:259:stop] epoch=0/micro_step=19470/global_step=19470, RunningAvgSamplesPerSec=2.637532066757679, CurrSamplesPerSec=2.639282292947431, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:28:51,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=19480, skipped=0, lr=[9.061461190495558e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:28:51,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=19480/global_step=19480, RunningAvgSamplesPerSec=2.6375301514141727, CurrSamplesPerSec=2.647807531709192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:29:06,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=19490, skipped=0, lr=[9.060478263391228e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:29:06,835] [INFO] [timer.py:259:stop] epoch=0/micro_step=19490/global_step=19490, RunningAvgSamplesPerSec=2.637534259716916, CurrSamplesPerSec=2.6503640882448622, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:29:22,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=19500, skipped=0, lr=[9.059494875218282e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:29:22,061] [INFO] [timer.py:259:stop] epoch=0/micro_step=19500/global_step=19500, RunningAvgSamplesPerSec=2.6375352728598775, CurrSamplesPerSec=2.635717685045681, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:29:37,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=19510, skipped=0, lr=[9.05851102608838e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:29:37,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=19510/global_step=19510, RunningAvgSamplesPerSec=2.6375363587628162, CurrSamplesPerSec=2.637865993891664, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:29:52,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=19520, skipped=0, lr=[9.057526716113245e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:29:52,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=19520/global_step=19520, RunningAvgSamplesPerSec=2.6375400492314247, CurrSamplesPerSec=2.653581004735103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:30:07,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=19530, skipped=0, lr=[9.056541945404639e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:30:07,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=19530/global_step=19530, RunningAvgSamplesPerSec=2.6375447706185193, CurrSamplesPerSec=2.6584808654976073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:30:22,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=19540, skipped=0, lr=[9.055556714074387e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:30:22,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=19540/global_step=19540, RunningAvgSamplesPerSec=2.637547264768355, CurrSamplesPerSec=2.6496675734348996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:30:38,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=19550, skipped=0, lr=[9.054571022234361e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:30:38,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=19550/global_step=19550, RunningAvgSamplesPerSec=2.6375452897937457, CurrSamplesPerSec=2.6011686882945533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:30:53,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=19560, skipped=0, lr=[9.053584869996489e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:30:53,346] [INFO] [timer.py:259:stop] epoch=0/micro_step=19560/global_step=19560, RunningAvgSamplesPerSec=2.6375452126512116, CurrSamplesPerSec=2.6501183407997684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:31:08,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=19570, skipped=0, lr=[9.052598257472744e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:31:08,564] [INFO] [timer.py:259:stop] epoch=0/micro_step=19570/global_step=19570, RunningAvgSamplesPerSec=2.6375465558562445, CurrSamplesPerSec=2.6153563565228506, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:31:23,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=19580, skipped=0, lr=[9.051611184775162e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:31:23,766] [INFO] [timer.py:259:stop] epoch=0/micro_step=19580/global_step=19580, RunningAvgSamplesPerSec=2.6375487031199794, CurrSamplesPerSec=2.6541704023355632, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:31:39,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=19590, skipped=0, lr=[9.05062365201582e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:31:39,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=19590/global_step=19590, RunningAvgSamplesPerSec=2.6375493594730144, CurrSamplesPerSec=2.62706183256409, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:31:54,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=19600, skipped=0, lr=[9.049635659306858e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:31:54,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=19600/global_step=19600, RunningAvgSamplesPerSec=2.6375556347551483, CurrSamplesPerSec=2.644732992296024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:32:09,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=19610, skipped=0, lr=[9.048647206760456e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:32:09,382] [INFO] [timer.py:259:stop] epoch=0/micro_step=19610/global_step=19610, RunningAvgSamplesPerSec=2.637558589106372, CurrSamplesPerSec=2.624097639222621, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:32:24,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=19620, skipped=0, lr=[9.04765829448886e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:32:24,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=19620/global_step=19620, RunningAvgSamplesPerSec=2.63756296806751, CurrSamplesPerSec=2.6540603951639756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:32:39,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=19630, skipped=0, lr=[9.046668922604356e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:32:39,803] [INFO] [timer.py:259:stop] epoch=0/micro_step=19630/global_step=19630, RunningAvgSamplesPerSec=2.637563777327902, CurrSamplesPerSec=2.649867198000077, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:32:55,042] [INFO] [logging.py:96:log_dist] [Rank 0] step=19640, skipped=0, lr=[9.045679091219292e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:32:55,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=19640/global_step=19640, RunningAvgSamplesPerSec=2.6375647822176673, CurrSamplesPerSec=2.655682875298192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:33:10,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=19650, skipped=0, lr=[9.044688800446061e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:33:10,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=19650/global_step=19650, RunningAvgSamplesPerSec=2.637564229273527, CurrSamplesPerSec=2.63171594846864, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:33:25,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=19660, skipped=0, lr=[9.043698050397111e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:33:25,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=19660/global_step=19660, RunningAvgSamplesPerSec=2.637566703925754, CurrSamplesPerSec=2.651342924975634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:33:40,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=19670, skipped=0, lr=[9.042706841184942e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:33:40,762] [INFO] [timer.py:259:stop] epoch=0/micro_step=19670/global_step=19670, RunningAvgSamplesPerSec=2.6375665373707196, CurrSamplesPerSec=2.6392428500412013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:33:55,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=19680, skipped=0, lr=[9.04171517292211e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:33:55,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=19680/global_step=19680, RunningAvgSamplesPerSec=2.63756779949939, CurrSamplesPerSec=2.6435824007052457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:34:11,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=19690, skipped=0, lr=[9.040723045721211e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:34:11,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=19690/global_step=19690, RunningAvgSamplesPerSec=2.6375666723714364, CurrSamplesPerSec=2.646405029602369, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:34:26,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=19700, skipped=0, lr=[9.039730459694909e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:34:26,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=19700/global_step=19700, RunningAvgSamplesPerSec=2.6375648788955246, CurrSamplesPerSec=2.6659424124421855, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:34:41,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=19710, skipped=0, lr=[9.03873741495591e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:34:41,701] [INFO] [timer.py:259:stop] epoch=0/micro_step=19710/global_step=19710, RunningAvgSamplesPerSec=2.6375686230565982, CurrSamplesPerSec=2.651431336493604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:34:56,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=19720, skipped=0, lr=[9.037743911616972e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:34:56,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=19720/global_step=19720, RunningAvgSamplesPerSec=2.6375728405447316, CurrSamplesPerSec=2.641054287212199, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:35:12,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=19730, skipped=0, lr=[9.036749949790912e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:35:12,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=19730/global_step=19730, RunningAvgSamplesPerSec=2.637577033172298, CurrSamplesPerSec=2.640215565315588, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:35:27,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=19740, skipped=0, lr=[9.03575552959059e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:35:27,348] [INFO] [timer.py:259:stop] epoch=0/micro_step=19740/global_step=19740, RunningAvgSamplesPerSec=2.6375773899286385, CurrSamplesPerSec=2.649839575198056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:35:42,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=19750, skipped=0, lr=[9.034760651128927e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:35:42,582] [INFO] [timer.py:259:stop] epoch=0/micro_step=19750/global_step=19750, RunningAvgSamplesPerSec=2.6375783030515163, CurrSamplesPerSec=2.6442448791503477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:35:57,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=19760, skipped=0, lr=[9.033765314518889e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:35:57,724] [INFO] [timer.py:259:stop] epoch=0/micro_step=19760/global_step=19760, RunningAvgSamplesPerSec=2.637587038012333, CurrSamplesPerSec=2.650894254097464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:36:12,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=19770, skipped=0, lr=[9.0327695198735e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:36:12,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=19770/global_step=19770, RunningAvgSamplesPerSec=2.637588562778291, CurrSamplesPerSec=2.6438519352077847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:36:28,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=19780, skipped=0, lr=[9.031773267305827e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:36:28,184] [INFO] [timer.py:259:stop] epoch=0/micro_step=19780/global_step=19780, RunningAvgSamplesPerSec=2.637589279133897, CurrSamplesPerSec=2.60706845598578, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:36:43,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=19790, skipped=0, lr=[9.030776556929002e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:36:43,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=19790/global_step=19790, RunningAvgSamplesPerSec=2.6375882418396936, CurrSamplesPerSec=2.6400946637347595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:36:58,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=19800, skipped=0, lr=[9.029779388856198e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:36:58,718] [INFO] [timer.py:259:stop] epoch=0/micro_step=19800/global_step=19800, RunningAvgSamplesPerSec=2.6375873675170323, CurrSamplesPerSec=2.579943942062276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:37:13,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=19810, skipped=0, lr=[9.02878176320064e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:37:13,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=19810/global_step=19810, RunningAvgSamplesPerSec=2.63759069470743, CurrSamplesPerSec=2.6454173209276886, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:37:29,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=19820, skipped=0, lr=[9.027783680075617e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:37:29,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=19820/global_step=19820, RunningAvgSamplesPerSec=2.637595077261563, CurrSamplesPerSec=2.6399081397179502, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:37:44,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=19830, skipped=0, lr=[9.026785139594454e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:37:44,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=19830/global_step=19830, RunningAvgSamplesPerSec=2.637598812143087, CurrSamplesPerSec=2.630556854760582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:37:59,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=19840, skipped=0, lr=[9.02578614187054e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:37:59,588] [INFO] [timer.py:259:stop] epoch=0/micro_step=19840/global_step=19840, RunningAvgSamplesPerSec=2.6375979023298246, CurrSamplesPerSec=2.6341769889482607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:38:14,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=19850, skipped=0, lr=[9.024786687017313e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:38:14,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=19850/global_step=19850, RunningAvgSamplesPerSec=2.6375989695689186, CurrSamplesPerSec=2.650337292442625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:38:30,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=19860, skipped=0, lr=[9.023786775148256e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:38:30,080] [INFO] [timer.py:259:stop] epoch=0/micro_step=19860/global_step=19860, RunningAvgSamplesPerSec=2.6375993267642253, CurrSamplesPerSec=2.649490991450468, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:38:45,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=19870, skipped=0, lr=[9.022786406376912e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:38:45,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=19870/global_step=19870, RunningAvgSamplesPerSec=2.637601991415342, CurrSamplesPerSec=2.6531538131786854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:39:00,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=19880, skipped=0, lr=[9.021785580816875e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:39:00,527] [INFO] [timer.py:259:stop] epoch=0/micro_step=19880/global_step=19880, RunningAvgSamplesPerSec=2.637603642298742, CurrSamplesPerSec=2.6343192715858046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:39:15,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=19890, skipped=0, lr=[9.020784298581787e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:39:15,710] [INFO] [timer.py:259:stop] epoch=0/micro_step=19890/global_step=19890, RunningAvgSamplesPerSec=2.637608347384018, CurrSamplesPerSec=2.64896180493018, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:39:30,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=19900, skipped=0, lr=[9.019782559785342e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:39:30,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=19900/global_step=19900, RunningAvgSamplesPerSec=2.6376096818054675, CurrSamplesPerSec=2.633863938542243, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:39:46,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=19910, skipped=0, lr=[9.018780364541292e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:39:46,204] [INFO] [timer.py:259:stop] epoch=0/micro_step=19910/global_step=19910, RunningAvgSamplesPerSec=2.6376100375580456, CurrSamplesPerSec=2.637537553420321, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:40:01,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=19920, skipped=0, lr=[9.017777712963436e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:40:01,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=19920/global_step=19920, RunningAvgSamplesPerSec=2.637613060572977, CurrSamplesPerSec=2.6348451061783327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:40:16,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=19930, skipped=0, lr=[9.016774605165622e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:40:16,605] [INFO] [timer.py:259:stop] epoch=0/micro_step=19930/global_step=19930, RunningAvgSamplesPerSec=2.6376151905633405, CurrSamplesPerSec=2.6439977650351363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:40:31,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=19940, skipped=0, lr=[9.015771041261756e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:40:31,802] [INFO] [timer.py:259:stop] epoch=0/micro_step=19940/global_step=19940, RunningAvgSamplesPerSec=2.6376179566704527, CurrSamplesPerSec=2.645532453039913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:40:47,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=19950, skipped=0, lr=[9.014767021365793e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:40:47,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=19950/global_step=19950, RunningAvgSamplesPerSec=2.637619625935586, CurrSamplesPerSec=2.643502009246385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:41:02,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=19960, skipped=0, lr=[9.013762545591737e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:41:02,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=19960/global_step=19960, RunningAvgSamplesPerSec=2.6376199977370103, CurrSamplesPerSec=2.648417779010763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:41:17,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=19970, skipped=0, lr=[9.01275761405365e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:41:17,529] [INFO] [timer.py:259:stop] epoch=0/micro_step=19970/global_step=19970, RunningAvgSamplesPerSec=2.637618078919555, CurrSamplesPerSec=2.631918244606301, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:41:32,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=19980, skipped=0, lr=[9.011752226865641e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:41:32,760] [INFO] [timer.py:259:stop] epoch=0/micro_step=19980/global_step=19980, RunningAvgSamplesPerSec=2.637618530557439, CurrSamplesPerSec=2.652468831419597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:41:47,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=19990, skipped=0, lr=[9.010746384141873e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:41:47,951] [INFO] [timer.py:259:stop] epoch=0/micro_step=19990/global_step=19990, RunningAvgSamplesPerSec=2.637623068467737, CurrSamplesPerSec=2.6485992355307424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:42:03,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=20000, skipped=0, lr=[9.009740085996557e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:42:03,172] [INFO] [timer.py:259:stop] epoch=0/micro_step=20000/global_step=20000, RunningAvgSamplesPerSec=2.6376254384925044, CurrSamplesPerSec=2.649796049527195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:42:18,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=20010, skipped=0, lr=[9.008733332543962e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:42:18,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=20010/global_step=20010, RunningAvgSamplesPerSec=2.6376279716242528, CurrSamplesPerSec=2.6458032198539856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:42:33,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=20020, skipped=0, lr=[9.007726123898402e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:42:33,621] [INFO] [timer.py:259:stop] epoch=0/micro_step=20020/global_step=20020, RunningAvgSamplesPerSec=2.6376300337370657, CurrSamplesPerSec=2.6516278742107566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:42:48,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=20030, skipped=0, lr=[9.00671846017425e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:42:48,877] [INFO] [timer.py:259:stop] epoch=0/micro_step=20030/global_step=20030, RunningAvgSamplesPerSec=2.637629847283993, CurrSamplesPerSec=2.607724106940771, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:43:04,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=20040, skipped=0, lr=[9.00571034148592e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:43:04,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=20040/global_step=20040, RunningAvgSamplesPerSec=2.6376290641278763, CurrSamplesPerSec=2.6382799785587108, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:43:19,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=20050, skipped=0, lr=[9.00470176794789e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:43:19,419] [INFO] [timer.py:259:stop] epoch=0/micro_step=20050/global_step=20050, RunningAvgSamplesPerSec=2.6376277405339494, CurrSamplesPerSec=2.639429695294798, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:43:34,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=20060, skipped=0, lr=[9.00369273967468e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:43:34,610] [INFO] [timer.py:259:stop] epoch=0/micro_step=20060/global_step=20060, RunningAvgSamplesPerSec=2.6376326183922654, CurrSamplesPerSec=2.6400252853537722, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:43:49,841] [INFO] [logging.py:96:log_dist] [Rank 0] step=20070, skipped=0, lr=[9.00268325678087e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:43:49,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=20070/global_step=20070, RunningAvgSamplesPerSec=2.6376338363707537, CurrSamplesPerSec=2.6474456959623085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:44:05,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=20080, skipped=0, lr=[9.001673319381083e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:44:05,036] [INFO] [timer.py:259:stop] epoch=0/micro_step=20080/global_step=20080, RunningAvgSamplesPerSec=2.6376382498559123, CurrSamplesPerSec=2.6376826874818833, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:44:20,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=20090, skipped=0, lr=[9.00066292759e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:44:20,320] [INFO] [timer.py:259:stop] epoch=0/micro_step=20090/global_step=20090, RunningAvgSamplesPerSec=2.637635083784398, CurrSamplesPerSec=2.638058866373289, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:44:35,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=20100, skipped=0, lr=[8.99965208152235e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:44:35,487] [INFO] [timer.py:259:stop] epoch=0/micro_step=20100/global_step=20100, RunningAvgSamplesPerSec=2.6376408239438365, CurrSamplesPerSec=2.6398512322923358, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:44:50,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=20110, skipped=0, lr=[8.998640781292916e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:44:50,806] [INFO] [timer.py:259:stop] epoch=0/micro_step=20110/global_step=20110, RunningAvgSamplesPerSec=2.637635413038398, CurrSamplesPerSec=2.649150447179043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:45:05,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=20120, skipped=0, lr=[8.99762902701653e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:45:05,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=20120/global_step=20120, RunningAvgSamplesPerSec=2.637639286553902, CurrSamplesPerSec=2.6380099197182494, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:45:21,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=20130, skipped=0, lr=[8.996616818808076e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:45:21,164] [INFO] [timer.py:259:stop] epoch=0/micro_step=20130/global_step=20130, RunningAvgSamplesPerSec=2.6376454572828067, CurrSamplesPerSec=2.667614661543371, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:45:36,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=20140, skipped=0, lr=[8.995604156782495e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:45:36,408] [INFO] [timer.py:259:stop] epoch=0/micro_step=20140/global_step=20140, RunningAvgSamplesPerSec=2.637645687103688, CurrSamplesPerSec=2.6541263143969975, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:45:51,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=20150, skipped=0, lr=[8.994591041054773e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:45:51,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=20150/global_step=20150, RunningAvgSamplesPerSec=2.6376505870413682, CurrSamplesPerSec=2.6385326643101323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:46:06,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=20160, skipped=0, lr=[8.99357747173995e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:46:06,797] [INFO] [timer.py:259:stop] epoch=0/micro_step=20160/global_step=20160, RunningAvgSamplesPerSec=2.637653851423226, CurrSamplesPerSec=2.6433562340992687, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:46:22,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=20170, skipped=0, lr=[8.992563448953113e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:46:22,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=20170/global_step=20170, RunningAvgSamplesPerSec=2.6376536107378135, CurrSamplesPerSec=2.6350387787224765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:46:37,275] [INFO] [logging.py:96:log_dist] [Rank 0] step=20180, skipped=0, lr=[8.99154897280941e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:46:37,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=20180/global_step=20180, RunningAvgSamplesPerSec=2.6376547548679583, CurrSamplesPerSec=2.6502535590527474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:46:52,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=20190, skipped=0, lr=[8.990534043424033e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:46:52,457] [INFO] [timer.py:259:stop] epoch=0/micro_step=20190/global_step=20190, RunningAvgSamplesPerSec=2.6376602972362977, CurrSamplesPerSec=2.650436104648253, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:47:07,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=20200, skipped=0, lr=[8.989518660912226e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:47:07,711] [INFO] [timer.py:259:stop] epoch=0/micro_step=20200/global_step=20200, RunningAvgSamplesPerSec=2.6376607385264235, CurrSamplesPerSec=2.643886099610105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:47:22,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=20210, skipped=0, lr=[8.98850282538929e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:47:22,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=20210/global_step=20210, RunningAvgSamplesPerSec=2.637662793261624, CurrSamplesPerSec=2.6527703811493817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:47:38,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=20220, skipped=0, lr=[8.987486536970569e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:47:38,188] [INFO] [timer.py:259:stop] epoch=0/micro_step=20220/global_step=20220, RunningAvgSamplesPerSec=2.637664496710764, CurrSamplesPerSec=2.621739372197911, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:47:53,451] [INFO] [logging.py:96:log_dist] [Rank 0] step=20230, skipped=0, lr=[8.986469795771467e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:47:53,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=20230/global_step=20230, RunningAvgSamplesPerSec=2.637664555211317, CurrSamplesPerSec=2.6216115540393017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:48:08,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=20240, skipped=0, lr=[8.985452601907432e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:48:08,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=20240/global_step=20240, RunningAvgSamplesPerSec=2.6376689734364307, CurrSamplesPerSec=2.6460110261191816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:48:23,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=20250, skipped=0, lr=[8.984434955493966e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:48:23,859] [INFO] [timer.py:259:stop] epoch=0/micro_step=20250/global_step=20250, RunningAvgSamplesPerSec=2.6376722186763963, CurrSamplesPerSec=2.6401141900444336, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:48:39,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=20260, skipped=0, lr=[8.983416856646627e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:48:39,126] [INFO] [timer.py:259:stop] epoch=0/micro_step=20260/global_step=20260, RunningAvgSamplesPerSec=2.6376706192348913, CurrSamplesPerSec=2.5994618770755817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:48:54,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=20270, skipped=0, lr=[8.98239830548102e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:48:54,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=20270/global_step=20270, RunningAvgSamplesPerSec=2.6376729486323898, CurrSamplesPerSec=2.64105470296466, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:49:09,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=20280, skipped=0, lr=[8.981379302112797e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:49:09,592] [INFO] [timer.py:259:stop] epoch=0/micro_step=20280/global_step=20280, RunningAvgSamplesPerSec=2.637671210201366, CurrSamplesPerSec=2.636408955682309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:49:24,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=20290, skipped=0, lr=[8.980359846657672e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:49:24,809] [INFO] [timer.py:259:stop] epoch=0/micro_step=20290/global_step=20290, RunningAvgSamplesPerSec=2.6376737181933185, CurrSamplesPerSec=2.6471207130636354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:49:40,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=20300, skipped=0, lr=[8.9793399392314e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:49:40,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=20300/global_step=20300, RunningAvgSamplesPerSec=2.6376750839871166, CurrSamplesPerSec=2.6491872585554797, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:49:55,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=20310, skipped=0, lr=[8.978319579949795e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:49:55,239] [INFO] [timer.py:259:stop] epoch=0/micro_step=20310/global_step=20310, RunningAvgSamplesPerSec=2.6376791803109594, CurrSamplesPerSec=2.6296716140876435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:50:10,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=20320, skipped=0, lr=[8.977298768928716e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:50:10,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=20320/global_step=20320, RunningAvgSamplesPerSec=2.637680760342494, CurrSamplesPerSec=2.6195943343982098, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:50:25,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=20330, skipped=0, lr=[8.976277506284079e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:50:25,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=20330/global_step=20330, RunningAvgSamplesPerSec=2.6376823095269635, CurrSamplesPerSec=2.651918753208213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:50:40,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=20340, skipped=0, lr=[8.97525579213185e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:50:40,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=20340/global_step=20340, RunningAvgSamplesPerSec=2.637684401338002, CurrSamplesPerSec=2.6648435585553196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:50:56,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=20350, skipped=0, lr=[8.97423362658804e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:50:56,086] [INFO] [timer.py:259:stop] epoch=0/micro_step=20350/global_step=20350, RunningAvgSamplesPerSec=2.637687101967347, CurrSamplesPerSec=2.6535436514591146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:51:11,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=20360, skipped=0, lr=[8.97321100976872e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:51:11,285] [INFO] [timer.py:259:stop] epoch=0/micro_step=20360/global_step=20360, RunningAvgSamplesPerSec=2.637691645457967, CurrSamplesPerSec=2.6604677337886287, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:51:26,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=20370, skipped=0, lr=[8.97218794179001e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:51:26,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=20370/global_step=20370, RunningAvgSamplesPerSec=2.637693605055871, CurrSamplesPerSec=2.6479157673884526, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:51:41,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=20380, skipped=0, lr=[8.971164422768076e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:51:41,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=20380/global_step=20380, RunningAvgSamplesPerSec=2.63769438936393, CurrSamplesPerSec=2.6462848127121426, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:51:56,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=20390, skipped=0, lr=[8.970140452819138e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:51:56,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=20390/global_step=20390, RunningAvgSamplesPerSec=2.6376961986585057, CurrSamplesPerSec=2.649063024259917, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:52:12,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=20400, skipped=0, lr=[8.969116032059473e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:52:12,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=20400/global_step=20400, RunningAvgSamplesPerSec=2.6376967702657597, CurrSamplesPerSec=2.6474223012119817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:52:27,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=20410, skipped=0, lr=[8.9680911606054e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:52:27,393] [INFO] [timer.py:259:stop] epoch=0/micro_step=20410/global_step=20410, RunningAvgSamplesPerSec=2.6377005946540866, CurrSamplesPerSec=2.640564621693294, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:52:42,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=20420, skipped=0, lr=[8.967065838573297e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:52:42,604] [INFO] [timer.py:259:stop] epoch=0/micro_step=20420/global_step=20420, RunningAvgSamplesPerSec=2.6377024895955152, CurrSamplesPerSec=2.6471319900540413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:52:57,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=20430, skipped=0, lr=[8.966040066079586e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:52:57,825] [INFO] [timer.py:259:stop] epoch=0/micro_step=20430/global_step=20430, RunningAvgSamplesPerSec=2.6377035390369765, CurrSamplesPerSec=2.634977528771384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:53:13,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=20440, skipped=0, lr=[8.965013843240745e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:53:13,035] [INFO] [timer.py:259:stop] epoch=0/micro_step=20440/global_step=20440, RunningAvgSamplesPerSec=2.637706160377416, CurrSamplesPerSec=2.635623693616141, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:53:28,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=20450, skipped=0, lr=[8.963987170173305e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:53:28,181] [INFO] [timer.py:259:stop] epoch=0/micro_step=20450/global_step=20450, RunningAvgSamplesPerSec=2.6377132530893257, CurrSamplesPerSec=2.636721368228881, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:53:43,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=20460, skipped=0, lr=[8.962960046993843e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:53:43,347] [INFO] [timer.py:259:stop] epoch=0/micro_step=20460/global_step=20460, RunningAvgSamplesPerSec=2.6377190850251777, CurrSamplesPerSec=2.6477419259716766, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:53:58,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=20470, skipped=0, lr=[8.961932473818986e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:53:58,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=20470/global_step=20470, RunningAvgSamplesPerSec=2.637720869928028, CurrSamplesPerSec=2.599409116504217, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:54:13,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=20480, skipped=0, lr=[8.96090445076542e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:54:13,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=20480/global_step=20480, RunningAvgSamplesPerSec=2.6377232869731033, CurrSamplesPerSec=2.6348496579840672, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:54:29,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=20490, skipped=0, lr=[8.959875977949874e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:54:29,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=20490/global_step=20490, RunningAvgSamplesPerSec=2.6377235945386928, CurrSamplesPerSec=2.608302228524644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:54:44,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=20500, skipped=0, lr=[8.958847055489133e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:54:44,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=20500/global_step=20500, RunningAvgSamplesPerSec=2.6377249708756514, CurrSamplesPerSec=2.6530564763903643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:54:59,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=20510, skipped=0, lr=[8.957817683500032e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:54:59,478] [INFO] [timer.py:259:stop] epoch=0/micro_step=20510/global_step=20510, RunningAvgSamplesPerSec=2.6377246258206393, CurrSamplesPerSec=2.639420975261146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:55:14,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=20520, skipped=0, lr=[8.956787862099456e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:55:14,676] [INFO] [timer.py:259:stop] epoch=0/micro_step=20520/global_step=20520, RunningAvgSamplesPerSec=2.6377291714445934, CurrSamplesPerSec=2.651402005004678, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:55:29,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=20530, skipped=0, lr=[8.955757591404342e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:55:29,872] [INFO] [timer.py:259:stop] epoch=0/micro_step=20530/global_step=20530, RunningAvgSamplesPerSec=2.6377323983918073, CurrSamplesPerSec=2.634894763090935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:55:45,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=20540, skipped=0, lr=[8.954726871531675e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:55:45,070] [INFO] [timer.py:259:stop] epoch=0/micro_step=20540/global_step=20540, RunningAvgSamplesPerSec=2.6377367319906178, CurrSamplesPerSec=2.649042110541262, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:56:00,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=20550, skipped=0, lr=[8.953695702598497e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:56:00,325] [INFO] [timer.py:259:stop] epoch=0/micro_step=20550/global_step=20550, RunningAvgSamplesPerSec=2.6377355199096226, CurrSamplesPerSec=2.6361753161783685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:56:15,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=20560, skipped=0, lr=[8.952664084721895e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:56:15,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=20560/global_step=20560, RunningAvgSamplesPerSec=2.637740427465974, CurrSamplesPerSec=2.647978874126596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:56:30,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=20570, skipped=0, lr=[8.95163201801901e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:56:30,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=20570/global_step=20570, RunningAvgSamplesPerSec=2.637739764180885, CurrSamplesPerSec=2.6429731303745108, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:56:46,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=20580, skipped=0, lr=[8.950599502607036e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:56:46,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=20580/global_step=20580, RunningAvgSamplesPerSec=2.6377408427244786, CurrSamplesPerSec=2.6445424775113247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:57:01,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=20590, skipped=0, lr=[8.949566538603211e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:57:01,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=20590/global_step=20590, RunningAvgSamplesPerSec=2.637744067558808, CurrSamplesPerSec=2.6468985341821374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:57:16,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=20600, skipped=0, lr=[8.948533126124835e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:57:16,490] [INFO] [timer.py:259:stop] epoch=0/micro_step=20600/global_step=20600, RunningAvgSamplesPerSec=2.637741640659676, CurrSamplesPerSec=2.6365759255913916, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:57:31,721] [INFO] [logging.py:96:log_dist] [Rank 0] step=20610, skipped=0, lr=[8.947499265289244e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:57:31,724] [INFO] [timer.py:259:stop] epoch=0/micro_step=20610/global_step=20610, RunningAvgSamplesPerSec=2.637741027355733, CurrSamplesPerSec=2.632465838714679, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:57:46,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=20620, skipped=0, lr=[8.94646495621384e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:57:46,965] [INFO] [timer.py:259:stop] epoch=0/micro_step=20620/global_step=20620, RunningAvgSamplesPerSec=2.63774140075458, CurrSamplesPerSec=2.654610942261407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:58:02,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=20630, skipped=0, lr=[8.945430199016066e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:58:02,156] [INFO] [timer.py:259:stop] epoch=0/micro_step=20630/global_step=20630, RunningAvgSamplesPerSec=2.63774572480216, CurrSamplesPerSec=2.644210288728446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:58:17,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=20640, skipped=0, lr=[8.944394993813419e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:58:17,384] [INFO] [timer.py:259:stop] epoch=0/micro_step=20640/global_step=20640, RunningAvgSamplesPerSec=2.6377456563978523, CurrSamplesPerSec=2.6349415250614356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:58:32,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=20650, skipped=0, lr=[8.94335934072345e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:58:32,630] [INFO] [timer.py:259:stop] epoch=0/micro_step=20650/global_step=20650, RunningAvgSamplesPerSec=2.637745690994507, CurrSamplesPerSec=2.6333418020778216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:58:47,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=20660, skipped=0, lr=[8.942323239863753e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:58:47,824] [INFO] [timer.py:259:stop] epoch=0/micro_step=20660/global_step=20660, RunningAvgSamplesPerSec=2.6377495567954234, CurrSamplesPerSec=2.651221421135207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:59:03,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=20670, skipped=0, lr=[8.94128669135198e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:59:03,019] [INFO] [timer.py:259:stop] epoch=0/micro_step=20670/global_step=20670, RunningAvgSamplesPerSec=2.637753982326295, CurrSamplesPerSec=2.648209176548436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:59:18,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=20680, skipped=0, lr=[8.940249695305831e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:59:18,246] [INFO] [timer.py:259:stop] epoch=0/micro_step=20680/global_step=20680, RunningAvgSamplesPerSec=2.637756295861234, CurrSamplesPerSec=2.624775846364044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:59:33,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=20690, skipped=0, lr=[8.93921225184306e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:59:33,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=20690/global_step=20690, RunningAvgSamplesPerSec=2.6377571158086304, CurrSamplesPerSec=2.637069501238724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 08:59:48,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=20700, skipped=0, lr=[8.938174361081464e-06], mom=[(0.9, 0.95)] +[2024-11-01 08:59:48,739] [INFO] [timer.py:259:stop] epoch=0/micro_step=20700/global_step=20700, RunningAvgSamplesPerSec=2.637755067986379, CurrSamplesPerSec=2.5964772268323584, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:00:03,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=20710, skipped=0, lr=[8.937136023138898e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:00:03,962] [INFO] [timer.py:259:stop] epoch=0/micro_step=20710/global_step=20710, RunningAvgSamplesPerSec=2.6377572789752333, CurrSamplesPerSec=2.654899535274707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:00:19,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=20720, skipped=0, lr=[8.936097238133268e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:00:19,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=20720/global_step=20720, RunningAvgSamplesPerSec=2.6377602627182926, CurrSamplesPerSec=2.649749177326881, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:00:34,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=20730, skipped=0, lr=[8.935058006182526e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:00:34,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=20730/global_step=20730, RunningAvgSamplesPerSec=2.637766677012539, CurrSamplesPerSec=2.6342448194513794, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:00:49,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=20740, skipped=0, lr=[8.934018327404677e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:00:49,541] [INFO] [timer.py:259:stop] epoch=0/micro_step=20740/global_step=20740, RunningAvgSamplesPerSec=2.6377689327190503, CurrSamplesPerSec=2.644170281658302, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:01:04,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=20750, skipped=0, lr=[8.932978201917776e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:01:04,757] [INFO] [timer.py:259:stop] epoch=0/micro_step=20750/global_step=20750, RunningAvgSamplesPerSec=2.6377712520568553, CurrSamplesPerSec=2.646119949669371, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:01:19,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=20760, skipped=0, lr=[8.931937629839933e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:01:19,970] [INFO] [timer.py:259:stop] epoch=0/micro_step=20760/global_step=20760, RunningAvgSamplesPerSec=2.637772888805861, CurrSamplesPerSec=2.6542993154724863, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:01:35,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=20770, skipped=0, lr=[8.930896611289302e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:01:35,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=20770/global_step=20770, RunningAvgSamplesPerSec=2.6377753869301652, CurrSamplesPerSec=2.6291527863607844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:01:50,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=20780, skipped=0, lr=[8.929855146384093e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:01:50,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=20780/global_step=20780, RunningAvgSamplesPerSec=2.6377709244214533, CurrSamplesPerSec=2.6350619551222096, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:02:05,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=20790, skipped=0, lr=[8.928813235242565e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:02:05,655] [INFO] [timer.py:259:stop] epoch=0/micro_step=20790/global_step=20790, RunningAvgSamplesPerSec=2.637774693373937, CurrSamplesPerSec=2.653306965256761, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:02:20,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=20800, skipped=0, lr=[8.927770877983024e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:02:20,880] [INFO] [timer.py:259:stop] epoch=0/micro_step=20800/global_step=20800, RunningAvgSamplesPerSec=2.6377759369848977, CurrSamplesPerSec=2.6462505862895678, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:02:36,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=20810, skipped=0, lr=[8.926728074723834e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:02:36,076] [INFO] [timer.py:259:stop] epoch=0/micro_step=20810/global_step=20810, RunningAvgSamplesPerSec=2.6377808944390058, CurrSamplesPerSec=2.6376586355904323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:02:51,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=20820, skipped=0, lr=[8.925684825583403e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:02:51,269] [INFO] [timer.py:259:stop] epoch=0/micro_step=20820/global_step=20820, RunningAvgSamplesPerSec=2.6377855203980634, CurrSamplesPerSec=2.656931963769696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:03:06,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=20830, skipped=0, lr=[8.924641130680192e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:03:06,447] [INFO] [timer.py:259:stop] epoch=0/micro_step=20830/global_step=20830, RunningAvgSamplesPerSec=2.637790525315321, CurrSamplesPerSec=2.6536817379120827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:03:21,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=20840, skipped=0, lr=[8.923596990132717e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:03:21,702] [INFO] [timer.py:259:stop] epoch=0/micro_step=20840/global_step=20840, RunningAvgSamplesPerSec=2.6377880129350473, CurrSamplesPerSec=2.644540810107029, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:03:36,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=20850, skipped=0, lr=[8.922552404059534e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:03:36,904] [INFO] [timer.py:259:stop] epoch=0/micro_step=20850/global_step=20850, RunningAvgSamplesPerSec=2.6377901264092594, CurrSamplesPerSec=2.652423961311725, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:03:52,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=20860, skipped=0, lr=[8.921507372579263e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:03:52,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=20860/global_step=20860, RunningAvgSamplesPerSec=2.637792448444436, CurrSamplesPerSec=2.6442457126656564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:04:07,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=20870, skipped=0, lr=[8.920461895810563e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:04:07,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=20870/global_step=20870, RunningAvgSamplesPerSec=2.637791763022594, CurrSamplesPerSec=2.656064206351881, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:04:22,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=20880, skipped=0, lr=[8.919415973872149e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:04:22,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=20880/global_step=20880, RunningAvgSamplesPerSec=2.6377910902755715, CurrSamplesPerSec=2.6482041604609807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:04:37,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=20890, skipped=0, lr=[8.918369606882787e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:04:37,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=20890/global_step=20890, RunningAvgSamplesPerSec=2.6377943546555827, CurrSamplesPerSec=2.6439256814241534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:04:53,057] [INFO] [logging.py:96:log_dist] [Rank 0] step=20900, skipped=0, lr=[8.917322794961292e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:04:53,059] [INFO] [timer.py:259:stop] epoch=0/micro_step=20900/global_step=20900, RunningAvgSamplesPerSec=2.6377944079060374, CurrSamplesPerSec=2.636869727962164, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:05:08,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=20910, skipped=0, lr=[8.916275538226528e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:05:08,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=20910/global_step=20910, RunningAvgSamplesPerSec=2.6377928942377102, CurrSamplesPerSec=2.61711962853246, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:05:23,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=20920, skipped=0, lr=[8.915227836797414e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:05:23,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=20920/global_step=20920, RunningAvgSamplesPerSec=2.6377993708959404, CurrSamplesPerSec=2.653345570762355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:05:38,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=20930, skipped=0, lr=[8.914179690792915e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:05:38,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=20930/global_step=20930, RunningAvgSamplesPerSec=2.6378027898982266, CurrSamplesPerSec=2.603224265919134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:05:53,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=20940, skipped=0, lr=[8.91313110033205e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:05:53,897] [INFO] [timer.py:259:stop] epoch=0/micro_step=20940/global_step=20940, RunningAvgSamplesPerSec=2.6378044312847235, CurrSamplesPerSec=2.649011158843661, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:06:09,157] [INFO] [logging.py:96:log_dist] [Rank 0] step=20950, skipped=0, lr=[8.912082065533885e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:06:09,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=20950/global_step=20950, RunningAvgSamplesPerSec=2.6378034778956514, CurrSamplesPerSec=2.6003526859903396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:06:24,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=20960, skipped=0, lr=[8.911032586517538e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:06:24,365] [INFO] [timer.py:259:stop] epoch=0/micro_step=20960/global_step=20960, RunningAvgSamplesPerSec=2.6378062787240832, CurrSamplesPerSec=2.6416406282351748, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:06:39,550] [INFO] [logging.py:96:log_dist] [Rank 0] step=20970, skipped=0, lr=[8.909982663402181e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:06:39,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=20970/global_step=20970, RunningAvgSamplesPerSec=2.6378115281316203, CurrSamplesPerSec=2.643492845763583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:06:54,770] [INFO] [logging.py:96:log_dist] [Rank 0] step=20980, skipped=0, lr=[8.908932296307029e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:06:54,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=20980/global_step=20980, RunningAvgSamplesPerSec=2.6378139979662953, CurrSamplesPerSec=2.6325938913177263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:07:09,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=20990, skipped=0, lr=[8.907881485351352e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:07:09,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=20990/global_step=20990, RunningAvgSamplesPerSec=2.6378158397750706, CurrSamplesPerSec=2.638507767012768, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:07:25,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=21000, skipped=0, lr=[8.906830230654476e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:07:25,221] [INFO] [timer.py:259:stop] epoch=0/micro_step=21000/global_step=21000, RunningAvgSamplesPerSec=2.637816936340952, CurrSamplesPerSec=2.6484625136775444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:07:40,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=21010, skipped=0, lr=[8.905778532335762e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:07:40,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=21010/global_step=21010, RunningAvgSamplesPerSec=2.6378190789257294, CurrSamplesPerSec=2.6429352424542722, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:07:55,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=21020, skipped=0, lr=[8.904726390514636e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:07:55,692] [INFO] [timer.py:259:stop] epoch=0/micro_step=21020/global_step=21020, RunningAvgSamplesPerSec=2.6378189431697177, CurrSamplesPerSec=2.625042791535083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:08:10,900] [INFO] [logging.py:96:log_dist] [Rank 0] step=21030, skipped=0, lr=[8.90367380531057e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:08:10,901] [INFO] [timer.py:259:stop] epoch=0/micro_step=21030/global_step=21030, RunningAvgSamplesPerSec=2.6378220316587737, CurrSamplesPerSec=2.641054287212199, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:08:26,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=21040, skipped=0, lr=[8.902620776843082e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:08:26,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=21040/global_step=21040, RunningAvgSamplesPerSec=2.6378241360695727, CurrSamplesPerSec=2.639192613941327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:08:41,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=21050, skipped=0, lr=[8.901567305231746e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:08:41,404] [INFO] [timer.py:259:stop] epoch=0/micro_step=21050/global_step=21050, RunningAvgSamplesPerSec=2.6378219399524525, CurrSamplesPerSec=2.599692679406136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:08:56,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=21060, skipped=0, lr=[8.900513390596184e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:08:56,602] [INFO] [timer.py:259:stop] epoch=0/micro_step=21060/global_step=21060, RunningAvgSamplesPerSec=2.6378252789552614, CurrSamplesPerSec=2.632629417795696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:09:11,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=21070, skipped=0, lr=[8.899459033056065e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:09:11,836] [INFO] [timer.py:259:stop] epoch=0/micro_step=21070/global_step=21070, RunningAvgSamplesPerSec=2.637826921791865, CurrSamplesPerSec=2.6417175788251677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:09:27,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=21080, skipped=0, lr=[8.898404232731117e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:09:27,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=21080/global_step=21080, RunningAvgSamplesPerSec=2.6378298171424324, CurrSamplesPerSec=2.65172929739297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:09:42,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=21090, skipped=0, lr=[8.89734898974111e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:09:42,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=21090/global_step=21090, RunningAvgSamplesPerSec=2.6378286986254067, CurrSamplesPerSec=2.6503858602331474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:09:57,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=21100, skipped=0, lr=[8.896293304205868e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:09:57,477] [INFO] [timer.py:259:stop] epoch=0/micro_step=21100/global_step=21100, RunningAvgSamplesPerSec=2.637833402186511, CurrSamplesPerSec=2.635702778468611, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:10:12,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=21110, skipped=0, lr=[8.895237176245263e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:10:12,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=21110/global_step=21110, RunningAvgSamplesPerSec=2.6378337300553003, CurrSamplesPerSec=2.641941384937777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:10:27,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=21120, skipped=0, lr=[8.89418060597922e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:10:27,937] [INFO] [timer.py:259:stop] epoch=0/micro_step=21120/global_step=21120, RunningAvgSamplesPerSec=2.637835308548324, CurrSamplesPerSec=2.6456208946561066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:10:43,172] [INFO] [logging.py:96:log_dist] [Rank 0] step=21130, skipped=0, lr=[8.893123593527712e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:10:43,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=21130/global_step=21130, RunningAvgSamplesPerSec=2.6378359290835807, CurrSamplesPerSec=2.6452946912215345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:10:58,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=21140, skipped=0, lr=[8.892066139010762e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:10:58,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=21140/global_step=21140, RunningAvgSamplesPerSec=2.6378363217667733, CurrSamplesPerSec=2.6180005223312914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:11:13,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=21150, skipped=0, lr=[8.891008242548449e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:11:13,690] [INFO] [timer.py:259:stop] epoch=0/micro_step=21150/global_step=21150, RunningAvgSamplesPerSec=2.637838536794411, CurrSamplesPerSec=2.6520688281336073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:11:28,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=21160, skipped=0, lr=[8.889949904260892e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:11:28,904] [INFO] [timer.py:259:stop] epoch=0/micro_step=21160/global_step=21160, RunningAvgSamplesPerSec=2.63784023621581, CurrSamplesPerSec=2.5982231731761325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:11:44,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=21170, skipped=0, lr=[8.88889112426827e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:11:44,129] [INFO] [timer.py:259:stop] epoch=0/micro_step=21170/global_step=21170, RunningAvgSamplesPerSec=2.637840968341263, CurrSamplesPerSec=2.6432375433549313, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:11:59,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=21180, skipped=0, lr=[8.887831902690804e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:11:59,354] [INFO] [timer.py:259:stop] epoch=0/micro_step=21180/global_step=21180, RunningAvgSamplesPerSec=2.637841523151207, CurrSamplesPerSec=2.6085268974978537, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:12:14,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=21190, skipped=0, lr=[8.886772239648773e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:12:14,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=21190/global_step=21190, RunningAvgSamplesPerSec=2.6378440848492515, CurrSamplesPerSec=2.6417982777730935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:12:29,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=21200, skipped=0, lr=[8.885712135262498e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:12:29,779] [INFO] [timer.py:259:stop] epoch=0/micro_step=21200/global_step=21200, RunningAvgSamplesPerSec=2.6378436562586933, CurrSamplesPerSec=2.6462171955110017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:12:44,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=21210, skipped=0, lr=[8.884651589652358e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:12:44,989] [INFO] [timer.py:259:stop] epoch=0/micro_step=21210/global_step=21210, RunningAvgSamplesPerSec=2.63784564960658, CurrSamplesPerSec=2.6410842217239843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:13:00,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=21220, skipped=0, lr=[8.883590602938774e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:13:00,195] [INFO] [timer.py:259:stop] epoch=0/micro_step=21220/global_step=21220, RunningAvgSamplesPerSec=2.6378483685510123, CurrSamplesPerSec=2.6497127688156032, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:13:15,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=21230, skipped=0, lr=[8.882529175242226e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:13:15,411] [INFO] [timer.py:259:stop] epoch=0/micro_step=21230/global_step=21230, RunningAvgSamplesPerSec=2.6378506352616857, CurrSamplesPerSec=2.6367470606187076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:13:30,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=21240, skipped=0, lr=[8.881467306683236e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:13:30,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=21240/global_step=21240, RunningAvgSamplesPerSec=2.6378515636671693, CurrSamplesPerSec=2.652299003965623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:13:45,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=21250, skipped=0, lr=[8.88040499738238e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:13:45,886] [INFO] [timer.py:259:stop] epoch=0/micro_step=21250/global_step=21250, RunningAvgSamplesPerSec=2.6378510949884344, CurrSamplesPerSec=2.640747081929677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:14:01,145] [INFO] [logging.py:96:log_dist] [Rank 0] step=21260, skipped=0, lr=[8.879342247460286e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:14:01,146] [INFO] [timer.py:259:stop] epoch=0/micro_step=21260/global_step=21260, RunningAvgSamplesPerSec=2.6378517382185924, CurrSamplesPerSec=2.6415740799886747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:14:16,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=21270, skipped=0, lr=[8.878279057037627e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:14:16,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=21270/global_step=21270, RunningAvgSamplesPerSec=2.637854409679318, CurrSamplesPerSec=2.653019557407903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:14:31,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=21280, skipped=0, lr=[8.877215426235129e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:14:31,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=21280/global_step=21280, RunningAvgSamplesPerSec=2.637854983034302, CurrSamplesPerSec=2.638104496246877, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:14:46,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=21290, skipped=0, lr=[8.876151355173569e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:14:46,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=21290/global_step=21290, RunningAvgSamplesPerSec=2.6378576150870257, CurrSamplesPerSec=2.6458645568620853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:15:02,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=21300, skipped=0, lr=[8.87508684397377e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:15:02,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=21300/global_step=21300, RunningAvgSamplesPerSec=2.6378573580946036, CurrSamplesPerSec=2.638014067669422, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:15:17,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=21310, skipped=0, lr=[8.87402189275661e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:15:17,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=21310/global_step=21310, RunningAvgSamplesPerSec=2.6378580880307405, CurrSamplesPerSec=2.64782550072676, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:15:32,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=21320, skipped=0, lr=[8.872956501643016e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:15:32,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=21320/global_step=21320, RunningAvgSamplesPerSec=2.6378581930610316, CurrSamplesPerSec=2.6432816867376925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:15:47,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=21330, skipped=0, lr=[8.87189067075396e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:15:47,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=21330/global_step=21330, RunningAvgSamplesPerSec=2.637860958593818, CurrSamplesPerSec=2.6420042072235717, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:16:02,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=21340, skipped=0, lr=[8.870824400210468e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:16:02,983] [INFO] [timer.py:259:stop] epoch=0/micro_step=21340/global_step=21340, RunningAvgSamplesPerSec=2.637860340347287, CurrSamplesPerSec=2.64123764674793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:16:18,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=21350, skipped=0, lr=[8.869757690133616e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:16:18,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=21350/global_step=21350, RunningAvgSamplesPerSec=2.6378642878413356, CurrSamplesPerSec=2.605248707259144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:16:33,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=21360, skipped=0, lr=[8.86869054064453e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:16:33,385] [INFO] [timer.py:259:stop] epoch=0/micro_step=21360/global_step=21360, RunningAvgSamplesPerSec=2.637864935657197, CurrSamplesPerSec=2.63853100447569, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:16:48,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=21370, skipped=0, lr=[8.867622951864386e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:16:48,624] [INFO] [timer.py:259:stop] epoch=0/micro_step=21370/global_step=21370, RunningAvgSamplesPerSec=2.637864324897661, CurrSamplesPerSec=2.5813062559588174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:17:03,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=21380, skipped=0, lr=[8.866554923914406e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:17:03,838] [INFO] [timer.py:259:stop] epoch=0/micro_step=21380/global_step=21380, RunningAvgSamplesPerSec=2.6378658174385863, CurrSamplesPerSec=2.647104841906576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:17:19,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=21390, skipped=0, lr=[8.865486456915865e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:17:19,069] [INFO] [timer.py:259:stop] epoch=0/micro_step=21390/global_step=21390, RunningAvgSamplesPerSec=2.6378662939541373, CurrSamplesPerSec=2.592384155630887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:17:34,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=21400, skipped=0, lr=[8.864417550990093e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:17:34,296] [INFO] [timer.py:259:stop] epoch=0/micro_step=21400/global_step=21400, RunningAvgSamplesPerSec=2.6378674019564117, CurrSamplesPerSec=2.6419097669043152, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:17:49,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=21410, skipped=0, lr=[8.86334820625846e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:17:49,572] [INFO] [timer.py:259:stop] epoch=0/micro_step=21410/global_step=21410, RunningAvgSamplesPerSec=2.637865035901555, CurrSamplesPerSec=2.6329330843732355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:18:04,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=21420, skipped=0, lr=[8.862278422842391e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:18:04,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=21420/global_step=21420, RunningAvgSamplesPerSec=2.637866040378182, CurrSamplesPerSec=2.6557278557344994, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:18:20,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=21430, skipped=0, lr=[8.861208200863361e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:18:20,006] [INFO] [timer.py:259:stop] epoch=0/micro_step=21430/global_step=21430, RunningAvgSamplesPerSec=2.6378681422847374, CurrSamplesPerSec=2.652363158447808, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:18:35,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=21440, skipped=0, lr=[8.860137540442892e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:18:35,145] [INFO] [timer.py:259:stop] epoch=0/micro_step=21440/global_step=21440, RunningAvgSamplesPerSec=2.637875503934646, CurrSamplesPerSec=2.6739559967900277, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:18:50,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=21450, skipped=0, lr=[8.859066441702562e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:18:50,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=21450/global_step=21450, RunningAvgSamplesPerSec=2.637878048000298, CurrSamplesPerSec=2.6313663384726156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:19:05,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=21460, skipped=0, lr=[8.857994904763991e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:19:05,567] [INFO] [timer.py:259:stop] epoch=0/micro_step=21460/global_step=21460, RunningAvgSamplesPerSec=2.6378809263078224, CurrSamplesPerSec=2.6498584088642385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:19:20,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=21470, skipped=0, lr=[8.856922929748854e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:19:20,808] [INFO] [timer.py:259:stop] epoch=0/micro_step=21470/global_step=21470, RunningAvgSamplesPerSec=2.637881501528994, CurrSamplesPerSec=2.656827197086926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:19:36,029] [INFO] [logging.py:96:log_dist] [Rank 0] step=21480, skipped=0, lr=[8.855850516778873e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:19:36,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=21480/global_step=21480, RunningAvgSamplesPerSec=2.6378812255420496, CurrSamplesPerSec=2.638609848918062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:19:51,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=21490, skipped=0, lr=[8.854777665975822e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:19:51,252] [INFO] [timer.py:259:stop] epoch=0/micro_step=21490/global_step=21490, RunningAvgSamplesPerSec=2.637882702538358, CurrSamplesPerSec=2.6376495125739368, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:20:06,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=21500, skipped=0, lr=[8.853704377461523e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:20:06,445] [INFO] [timer.py:259:stop] epoch=0/micro_step=21500/global_step=21500, RunningAvgSamplesPerSec=2.6378860329850675, CurrSamplesPerSec=2.6410168700266867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:20:21,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=21510, skipped=0, lr=[8.852630651357848e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:20:21,689] [INFO] [timer.py:259:stop] epoch=0/micro_step=21510/global_step=21510, RunningAvgSamplesPerSec=2.637885413354698, CurrSamplesPerSec=2.6521053014212193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:20:36,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=21520, skipped=0, lr=[8.85155648778672e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:20:36,937] [INFO] [timer.py:259:stop] epoch=0/micro_step=21520/global_step=21520, RunningAvgSamplesPerSec=2.6378851236297884, CurrSamplesPerSec=2.6279197955747873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:20:52,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=21530, skipped=0, lr=[8.850481886870109e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:20:52,118] [INFO] [timer.py:259:stop] epoch=0/micro_step=21530/global_step=21530, RunningAvgSamplesPerSec=2.6378893850637346, CurrSamplesPerSec=2.6361003449017573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:21:07,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=21540, skipped=0, lr=[8.849406848730037e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:21:07,317] [INFO] [timer.py:259:stop] epoch=0/micro_step=21540/global_step=21540, RunningAvgSamplesPerSec=2.6378920744168126, CurrSamplesPerSec=2.6473638161450386, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:21:22,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=21550, skipped=0, lr=[8.848331373488574e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:21:22,544] [INFO] [timer.py:259:stop] epoch=0/micro_step=21550/global_step=21550, RunningAvgSamplesPerSec=2.63789303124107, CurrSamplesPerSec=2.6380124084873877, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:21:37,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=21560, skipped=0, lr=[8.847255461267844e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:21:37,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=21560/global_step=21560, RunningAvgSamplesPerSec=2.6378910101126767, CurrSamplesPerSec=2.6283393108947175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:21:53,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=21570, skipped=0, lr=[8.846179112190015e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:21:53,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=21570/global_step=21570, RunningAvgSamplesPerSec=2.637892342589359, CurrSamplesPerSec=2.6418174132983117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:22:08,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=21580, skipped=0, lr=[8.845102326377304e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:22:08,278] [INFO] [timer.py:259:stop] epoch=0/micro_step=21580/global_step=21580, RunningAvgSamplesPerSec=2.637894357881306, CurrSamplesPerSec=2.652889929573354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:22:23,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=21590, skipped=0, lr=[8.844025103951985e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:22:23,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=21590/global_step=21590, RunningAvgSamplesPerSec=2.6378942856822603, CurrSamplesPerSec=2.636775239944569, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:22:38,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=21600, skipped=0, lr=[8.842947445036374e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:22:38,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=21600/global_step=21600, RunningAvgSamplesPerSec=2.637894782858376, CurrSamplesPerSec=2.6212777288585354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:22:53,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=21610, skipped=0, lr=[8.841869349752841e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:22:53,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=21610/global_step=21610, RunningAvgSamplesPerSec=2.637898307210749, CurrSamplesPerSec=2.6438252709219006, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:23:09,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=21620, skipped=0, lr=[8.840790818223803e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:23:09,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=21620/global_step=21620, RunningAvgSamplesPerSec=2.6379013616345577, CurrSamplesPerSec=2.6401486733818387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:23:24,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=21630, skipped=0, lr=[8.839711850571728e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:23:24,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=21630/global_step=21630, RunningAvgSamplesPerSec=2.6379043316809203, CurrSamplesPerSec=2.637267231981501, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:23:39,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=21640, skipped=0, lr=[8.838632446919135e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:23:39,612] [INFO] [timer.py:259:stop] epoch=0/micro_step=21640/global_step=21640, RunningAvgSamplesPerSec=2.637904146182624, CurrSamplesPerSec=2.636588770306558, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:23:54,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=21650, skipped=0, lr=[8.837552607388587e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:23:54,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=21650/global_step=21650, RunningAvgSamplesPerSec=2.6379064591004244, CurrSamplesPerSec=2.6486213966711927, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:24:10,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=21660, skipped=0, lr=[8.836472332102702e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:24:10,043] [INFO] [timer.py:259:stop] epoch=0/micro_step=21660/global_step=21660, RunningAvgSamplesPerSec=2.6379083383862505, CurrSamplesPerSec=2.6420229296746807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:24:25,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=21670, skipped=0, lr=[8.835391621184147e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:24:25,259] [INFO] [timer.py:259:stop] epoch=0/micro_step=21670/global_step=21670, RunningAvgSamplesPerSec=2.6379108602510857, CurrSamplesPerSec=2.6446000042471383, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:24:40,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=21680, skipped=0, lr=[8.834310474755635e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:24:40,452] [INFO] [timer.py:259:stop] epoch=0/micro_step=21680/global_step=21680, RunningAvgSamplesPerSec=2.6379146561155737, CurrSamplesPerSec=2.647521731756477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:24:55,682] [INFO] [logging.py:96:log_dist] [Rank 0] step=21690, skipped=0, lr=[8.83322889293993e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:24:55,689] [INFO] [timer.py:259:stop] epoch=0/micro_step=21690/global_step=21690, RunningAvgSamplesPerSec=2.6379148080506587, CurrSamplesPerSec=2.6407686962020263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:25:10,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=21700, skipped=0, lr=[8.832146875859849e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:25:10,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=21700/global_step=21700, RunningAvgSamplesPerSec=2.6379135580231514, CurrSamplesPerSec=2.6473834501293783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:25:26,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=21710, skipped=0, lr=[8.831064423638252e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:25:26,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=21710/global_step=21710, RunningAvgSamplesPerSec=2.637913251366479, CurrSamplesPerSec=2.6535524650586564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:25:41,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=21720, skipped=0, lr=[8.829981536398054e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:25:41,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=21720/global_step=21720, RunningAvgSamplesPerSec=2.6379130261193318, CurrSamplesPerSec=2.633826311336817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:25:56,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=21730, skipped=0, lr=[8.828898214262218e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:25:56,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=21730/global_step=21730, RunningAvgSamplesPerSec=2.637911831324902, CurrSamplesPerSec=2.6374297498847072, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:26:11,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=21740, skipped=0, lr=[8.827814457353751e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:26:11,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=21740/global_step=21740, RunningAvgSamplesPerSec=2.6379115007385696, CurrSamplesPerSec=2.65451853854729, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:26:27,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=21750, skipped=0, lr=[8.82673026579572e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:26:27,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=21750/global_step=21750, RunningAvgSamplesPerSec=2.6379118826946355, CurrSamplesPerSec=2.635642739795962, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:26:42,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=21760, skipped=0, lr=[8.82564563971123e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:26:42,411] [INFO] [timer.py:259:stop] epoch=0/micro_step=21760/global_step=21760, RunningAvgSamplesPerSec=2.637914428457867, CurrSamplesPerSec=2.6525627700191756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:26:57,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=21770, skipped=0, lr=[8.824560579223444e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:26:57,600] [INFO] [timer.py:259:stop] epoch=0/micro_step=21770/global_step=21770, RunningAvgSamplesPerSec=2.6379179209619736, CurrSamplesPerSec=2.650415169243783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:27:12,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=21780, skipped=0, lr=[8.823475084455571e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:27:12,786] [INFO] [timer.py:259:stop] epoch=0/micro_step=21780/global_step=21780, RunningAvgSamplesPerSec=2.637922576254444, CurrSamplesPerSec=2.662435603382666, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:27:28,061] [INFO] [logging.py:96:log_dist] [Rank 0] step=21790, skipped=0, lr=[8.822389155530869e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:27:28,063] [INFO] [timer.py:259:stop] epoch=0/micro_step=21790/global_step=21790, RunningAvgSamplesPerSec=2.6379206947791256, CurrSamplesPerSec=2.6412006401651844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:27:43,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=21800, skipped=0, lr=[8.821302792572642e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:27:43,253] [INFO] [timer.py:259:stop] epoch=0/micro_step=21800/global_step=21800, RunningAvgSamplesPerSec=2.6379240390982868, CurrSamplesPerSec=2.6572096985710183, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:27:58,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=21810, skipped=0, lr=[8.820215995704252e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:27:58,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=21810/global_step=21810, RunningAvgSamplesPerSec=2.637929007082411, CurrSamplesPerSec=2.62220117816951, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:28:13,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=21820, skipped=0, lr=[8.8191287650491e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:28:13,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=21820/global_step=21820, RunningAvgSamplesPerSec=2.637930873671143, CurrSamplesPerSec=2.647104841906576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:28:28,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=21830, skipped=0, lr=[8.818041100730647e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:28:28,886] [INFO] [timer.py:259:stop] epoch=0/micro_step=21830/global_step=21830, RunningAvgSamplesPerSec=2.6379302749776867, CurrSamplesPerSec=2.616754703033131, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:28:44,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=21840, skipped=0, lr=[8.816953002872396e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:28:44,131] [INFO] [timer.py:259:stop] epoch=0/micro_step=21840/global_step=21840, RunningAvgSamplesPerSec=2.6379308653965663, CurrSamplesPerSec=2.653611223885188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:28:59,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=21850, skipped=0, lr=[8.815864471597897e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:28:59,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=21850/global_step=21850, RunningAvgSamplesPerSec=2.63793229248475, CurrSamplesPerSec=2.634483908468875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:29:14,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=21860, skipped=0, lr=[8.814775507030757e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:29:14,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=21860/global_step=21860, RunningAvgSamplesPerSec=2.63793391303513, CurrSamplesPerSec=2.6521145247103397, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:29:29,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=21870, skipped=0, lr=[8.813686109294626e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:29:29,846] [INFO] [timer.py:259:stop] epoch=0/micro_step=21870/global_step=21870, RunningAvgSamplesPerSec=2.637933645076639, CurrSamplesPerSec=2.6516848714078205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:29:45,072] [INFO] [logging.py:96:log_dist] [Rank 0] step=21880, skipped=0, lr=[8.812596278513208e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:29:45,073] [INFO] [timer.py:259:stop] epoch=0/micro_step=21880/global_step=21880, RunningAvgSamplesPerSec=2.6379355224243732, CurrSamplesPerSec=2.644712563795249, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:30:00,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=21890, skipped=0, lr=[8.81150601481025e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:30:00,279] [INFO] [timer.py:259:stop] epoch=0/micro_step=21890/global_step=21890, RunningAvgSamplesPerSec=2.637937215099747, CurrSamplesPerSec=2.642679631468792, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:30:15,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=21900, skipped=0, lr=[8.810415318309556e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:30:15,518] [INFO] [timer.py:259:stop] epoch=0/micro_step=21900/global_step=21900, RunningAvgSamplesPerSec=2.6379382938847558, CurrSamplesPerSec=2.6270721165720103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:30:30,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=21910, skipped=0, lr=[8.809324189134974e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:30:30,747] [INFO] [timer.py:259:stop] epoch=0/micro_step=21910/global_step=21910, RunningAvgSamplesPerSec=2.6379395844677216, CurrSamplesPerSec=2.6506324948442335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:30:45,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=21920, skipped=0, lr=[8.808232627410398e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:30:45,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=21920/global_step=21920, RunningAvgSamplesPerSec=2.6379432878963187, CurrSamplesPerSec=2.644837224253612, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:31:01,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=21930, skipped=0, lr=[8.80714063325978e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:31:01,199] [INFO] [timer.py:259:stop] epoch=0/micro_step=21930/global_step=21930, RunningAvgSamplesPerSec=2.6379438106261506, CurrSamplesPerSec=2.648142296944749, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:31:16,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=21940, skipped=0, lr=[8.806048206807114e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:31:16,368] [INFO] [timer.py:259:stop] epoch=0/micro_step=21940/global_step=21940, RunningAvgSamplesPerSec=2.637949683601591, CurrSamplesPerSec=2.6533728470685634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:31:31,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=21950, skipped=0, lr=[8.804955348176445e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:31:31,532] [INFO] [timer.py:259:stop] epoch=0/micro_step=21950/global_step=21950, RunningAvgSamplesPerSec=2.6379557028715763, CurrSamplesPerSec=2.653682157649653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:31:46,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=21960, skipped=0, lr=[8.80386205749187e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:31:46,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=21960/global_step=21960, RunningAvgSamplesPerSec=2.6379584289251414, CurrSamplesPerSec=2.638309435324732, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:32:02,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=21970, skipped=0, lr=[8.802768334877529e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:32:02,012] [INFO] [timer.py:259:stop] epoch=0/micro_step=21970/global_step=21970, RunningAvgSamplesPerSec=2.637956513698508, CurrSamplesPerSec=2.638469592069376, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:32:17,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=21980, skipped=0, lr=[8.80167418045762e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:32:17,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=21980/global_step=21980, RunningAvgSamplesPerSec=2.637956315057024, CurrSamplesPerSec=2.6393067896615974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:32:32,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=21990, skipped=0, lr=[8.800579594356376e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:32:32,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=21990/global_step=21990, RunningAvgSamplesPerSec=2.637954383446738, CurrSamplesPerSec=2.6411619714731827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:32:47,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=22000, skipped=0, lr=[8.799484576698097e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:32:47,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=22000/global_step=22000, RunningAvgSamplesPerSec=2.6379562889533723, CurrSamplesPerSec=2.6417367131813347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:33:02,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=22010, skipped=0, lr=[8.798389127607115e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:33:02,945] [INFO] [timer.py:259:stop] epoch=0/micro_step=22010/global_step=22010, RunningAvgSamplesPerSec=2.637959188138849, CurrSamplesPerSec=2.6434095442883114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:33:18,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=22020, skipped=0, lr=[8.797293247207823e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:33:18,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=22020/global_step=22020, RunningAvgSamplesPerSec=2.6379600715548315, CurrSamplesPerSec=2.6279844226719784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:33:33,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=22030, skipped=0, lr=[8.796196935624656e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:33:33,415] [INFO] [timer.py:259:stop] epoch=0/micro_step=22030/global_step=22030, RunningAvgSamplesPerSec=2.637959793680968, CurrSamplesPerSec=2.6379667817993013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:33:48,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=22040, skipped=0, lr=[8.795100192982105e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:33:48,626] [INFO] [timer.py:259:stop] epoch=0/micro_step=22040/global_step=22040, RunningAvgSamplesPerSec=2.6379606696593374, CurrSamplesPerSec=2.6130967925633755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:34:03,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=22050, skipped=0, lr=[8.7940030194047e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:34:03,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=22050/global_step=22050, RunningAvgSamplesPerSec=2.637963151549883, CurrSamplesPerSec=2.660945395565328, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:34:19,024] [INFO] [logging.py:96:log_dist] [Rank 0] step=22060, skipped=0, lr=[8.792905415017029e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:34:19,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=22060/global_step=22060, RunningAvgSamplesPerSec=2.637967375086885, CurrSamplesPerSec=2.6346642880725364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:34:34,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=22070, skipped=0, lr=[8.791807379943722e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:34:34,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=22070/global_step=22070, RunningAvgSamplesPerSec=2.637972903852163, CurrSamplesPerSec=2.637117169460005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:34:49,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=22080, skipped=0, lr=[8.790708914309465e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:34:49,478] [INFO] [timer.py:259:stop] epoch=0/micro_step=22080/global_step=22080, RunningAvgSamplesPerSec=2.6379706429391314, CurrSamplesPerSec=2.5875211577661714, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:35:04,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=22090, skipped=0, lr=[8.789610018238986e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:35:04,687] [INFO] [timer.py:259:stop] epoch=0/micro_step=22090/global_step=22090, RunningAvgSamplesPerSec=2.6379732480778637, CurrSamplesPerSec=2.6524227032931957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:35:19,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=22100, skipped=0, lr=[8.788510691857068e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:35:19,860] [INFO] [timer.py:259:stop] epoch=0/micro_step=22100/global_step=22100, RunningAvgSamplesPerSec=2.6379788197301166, CurrSamplesPerSec=2.65427411973806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:35:35,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=22110, skipped=0, lr=[8.787410935288538e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:35:35,074] [INFO] [timer.py:259:stop] epoch=0/micro_step=22110/global_step=22110, RunningAvgSamplesPerSec=2.637982655156702, CurrSamplesPerSec=2.6399422022705497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:35:50,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=22120, skipped=0, lr=[8.786310748658275e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:35:50,325] [INFO] [timer.py:259:stop] epoch=0/micro_step=22120/global_step=22120, RunningAvgSamplesPerSec=2.637982894978896, CurrSamplesPerSec=2.6360572694121402, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:36:05,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=22130, skipped=0, lr=[8.785210132091203e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:36:05,558] [INFO] [timer.py:259:stop] epoch=0/micro_step=22130/global_step=22130, RunningAvgSamplesPerSec=2.637983650630198, CurrSamplesPerSec=2.638794944386045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:36:20,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=22140, skipped=0, lr=[8.7841090857123e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:36:20,777] [INFO] [timer.py:259:stop] epoch=0/micro_step=22140/global_step=22140, RunningAvgSamplesPerSec=2.637984889242062, CurrSamplesPerSec=2.664674682174172, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:36:36,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=22150, skipped=0, lr=[8.783007609646588e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:36:36,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=22150/global_step=22150, RunningAvgSamplesPerSec=2.637983905358284, CurrSamplesPerSec=2.639680524735302, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:36:51,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=22160, skipped=0, lr=[8.781905704019142e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:36:51,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=22160/global_step=22160, RunningAvgSamplesPerSec=2.637986057840503, CurrSamplesPerSec=2.6507807489370943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:37:06,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=22170, skipped=0, lr=[8.780803368955082e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:37:06,502] [INFO] [timer.py:259:stop] epoch=0/micro_step=22170/global_step=22170, RunningAvgSamplesPerSec=2.6379869202380033, CurrSamplesPerSec=2.6470622413207496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:37:21,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=22180, skipped=0, lr=[8.77970060457958e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:37:21,739] [INFO] [timer.py:259:stop] epoch=0/micro_step=22180/global_step=22180, RunningAvgSamplesPerSec=2.6379869889782035, CurrSamplesPerSec=2.653456777682138, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:37:36,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=22190, skipped=0, lr=[8.778597411017855e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:37:36,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=22190/global_step=22190, RunningAvgSamplesPerSec=2.63799055189055, CurrSamplesPerSec=2.642069528927325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:37:52,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=22200, skipped=0, lr=[8.777493788395173e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:37:52,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=22200/global_step=22200, RunningAvgSamplesPerSec=2.6379936542838767, CurrSamplesPerSec=2.6597524058979367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:38:07,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=22210, skipped=0, lr=[8.776389736836854e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:38:07,335] [INFO] [timer.py:259:stop] epoch=0/micro_step=22210/global_step=22210, RunningAvgSamplesPerSec=2.637994576766111, CurrSamplesPerSec=2.6474248077725973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:38:22,562] [INFO] [logging.py:96:log_dist] [Rank 0] step=22220, skipped=0, lr=[8.775285256468262e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:38:22,584] [INFO] [timer.py:259:stop] epoch=0/micro_step=22220/global_step=22220, RunningAvgSamplesPerSec=2.637992913759745, CurrSamplesPerSec=2.6297297322470436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:38:37,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=22230, skipped=0, lr=[8.774180347414809e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:38:37,846] [INFO] [timer.py:259:stop] epoch=0/micro_step=22230/global_step=22230, RunningAvgSamplesPerSec=2.6379904471016173, CurrSamplesPerSec=2.6546390846704604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:38:53,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=22240, skipped=0, lr=[8.77307500980196e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:38:53,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=22240/global_step=22240, RunningAvgSamplesPerSec=2.6379935175125397, CurrSamplesPerSec=2.6381285562703485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:39:08,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=22250, skipped=0, lr=[8.771969243755227e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:39:08,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=22250/global_step=22250, RunningAvgSamplesPerSec=2.6379956675721896, CurrSamplesPerSec=2.648341691740367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:39:23,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=22260, skipped=0, lr=[8.770863049400169e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:39:23,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=22260/global_step=22260, RunningAvgSamplesPerSec=2.6379932698791806, CurrSamplesPerSec=2.6513006068569602, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:39:38,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=22270, skipped=0, lr=[8.769756426862394e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:39:38,718] [INFO] [timer.py:259:stop] epoch=0/micro_step=22270/global_step=22270, RunningAvgSamplesPerSec=2.637995519804301, CurrSamplesPerSec=2.609469387690073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:39:53,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=22280, skipped=0, lr=[8.768649376267561e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:39:53,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=22280/global_step=22280, RunningAvgSamplesPerSec=2.6379977001083645, CurrSamplesPerSec=2.6427674661498255, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:40:09,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=22290, skipped=0, lr=[8.767541897741373e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:40:09,168] [INFO] [timer.py:259:stop] epoch=0/micro_step=22290/global_step=22290, RunningAvgSamplesPerSec=2.637996186286094, CurrSamplesPerSec=2.57772651331167, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:40:24,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=22300, skipped=0, lr=[8.76643399140959e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:40:24,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=22300/global_step=22300, RunningAvgSamplesPerSec=2.63799758326974, CurrSamplesPerSec=2.647002936905951, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:40:39,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=22310, skipped=0, lr=[8.765325657398009e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:40:39,716] [INFO] [timer.py:259:stop] epoch=0/micro_step=22310/global_step=22310, RunningAvgSamplesPerSec=2.6379925310705, CurrSamplesPerSec=2.6410368257270207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:40:54,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=22320, skipped=0, lr=[8.764216895832488e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:40:54,956] [INFO] [timer.py:259:stop] epoch=0/micro_step=22320/global_step=22320, RunningAvgSamplesPerSec=2.63799317249296, CurrSamplesPerSec=2.6437527801134513, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:41:10,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=22330, skipped=0, lr=[8.76310770683892e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:41:10,199] [INFO] [timer.py:259:stop] epoch=0/micro_step=22330/global_step=22330, RunningAvgSamplesPerSec=2.6379933453099116, CurrSamplesPerSec=2.6508188621320596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:41:25,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=22340, skipped=0, lr=[8.761998090543258e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:41:25,414] [INFO] [timer.py:259:stop] epoch=0/micro_step=22340/global_step=22340, RunningAvgSamplesPerSec=2.6379951226124367, CurrSamplesPerSec=2.644700890507927, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:41:40,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=22350, skipped=0, lr=[8.760888047071501e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:41:40,666] [INFO] [timer.py:259:stop] epoch=0/micro_step=22350/global_step=22350, RunningAvgSamplesPerSec=2.6379949312478397, CurrSamplesPerSec=2.6613936759170307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:41:55,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=22360, skipped=0, lr=[8.75977757654969e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:41:55,861] [INFO] [timer.py:259:stop] epoch=0/micro_step=22360/global_step=22360, RunningAvgSamplesPerSec=2.63799860345931, CurrSamplesPerSec=2.6352175786509253, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:42:11,075] [INFO] [logging.py:96:log_dist] [Rank 0] step=22370, skipped=0, lr=[8.758666679103924e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:42:11,077] [INFO] [timer.py:259:stop] epoch=0/micro_step=22370/global_step=22370, RunningAvgSamplesPerSec=2.6380014003479615, CurrSamplesPerSec=2.674359646039844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:42:26,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=22380, skipped=0, lr=[8.757555354860341e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:42:26,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=22380/global_step=22380, RunningAvgSamplesPerSec=2.6380054839408884, CurrSamplesPerSec=2.646329892766026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:42:41,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=22390, skipped=0, lr=[8.756443603945138e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:42:41,498] [INFO] [timer.py:259:stop] epoch=0/micro_step=22390/global_step=22390, RunningAvgSamplesPerSec=2.6380074100191755, CurrSamplesPerSec=2.64539604761257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:42:56,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=22400, skipped=0, lr=[8.755331426484549e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:42:56,720] [INFO] [timer.py:259:stop] epoch=0/micro_step=22400/global_step=22400, RunningAvgSamplesPerSec=2.6380117073658407, CurrSamplesPerSec=2.6534832169251366, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:43:11,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=22410, skipped=0, lr=[8.754218822604865e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:43:11,953] [INFO] [timer.py:259:stop] epoch=0/micro_step=22410/global_step=22410, RunningAvgSamplesPerSec=2.6380118035647837, CurrSamplesPerSec=2.654392123886864, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:43:27,193] [INFO] [logging.py:96:log_dist] [Rank 0] step=22420, skipped=0, lr=[8.753105792432424e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:43:27,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=22420/global_step=22420, RunningAvgSamplesPerSec=2.638011022517406, CurrSamplesPerSec=2.6438436025606697, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:43:42,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=22430, skipped=0, lr=[8.751992336093607e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:43:42,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=22430/global_step=22430, RunningAvgSamplesPerSec=2.6380127069678174, CurrSamplesPerSec=2.6453547533304977, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:43:57,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=22440, skipped=0, lr=[8.750878453714851e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:43:57,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=22440/global_step=22440, RunningAvgSamplesPerSec=2.6380147299007612, CurrSamplesPerSec=2.639391078440329, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:44:12,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=22450, skipped=0, lr=[8.749764145422635e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:44:12,876] [INFO] [timer.py:259:stop] epoch=0/micro_step=22450/global_step=22450, RunningAvgSamplesPerSec=2.63801399164576, CurrSamplesPerSec=2.6455674952371337, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:44:28,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=22460, skipped=0, lr=[8.74864941134349e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:44:28,128] [INFO] [timer.py:259:stop] epoch=0/micro_step=22460/global_step=22460, RunningAvgSamplesPerSec=2.638012375475893, CurrSamplesPerSec=2.6409137703789667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:44:43,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=22470, skipped=0, lr=[8.747534251603997e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:44:43,351] [INFO] [timer.py:259:stop] epoch=0/micro_step=22470/global_step=22470, RunningAvgSamplesPerSec=2.6380149884362707, CurrSamplesPerSec=2.6384745713471722, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:44:58,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=22480, skipped=0, lr=[8.746418666330778e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:44:58,575] [INFO] [timer.py:259:stop] epoch=0/micro_step=22480/global_step=22480, RunningAvgSamplesPerSec=2.638015137073901, CurrSamplesPerSec=2.625010344531115, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:45:13,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=22490, skipped=0, lr=[8.745302655650511e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:45:13,788] [INFO] [timer.py:259:stop] epoch=0/micro_step=22490/global_step=22490, RunningAvgSamplesPerSec=2.6380164917129743, CurrSamplesPerSec=2.6347102143476193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:45:29,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=22500, skipped=0, lr=[8.744186219689918e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:45:29,035] [INFO] [timer.py:259:stop] epoch=0/micro_step=22500/global_step=22500, RunningAvgSamplesPerSec=2.6380156699836315, CurrSamplesPerSec=2.5868704400168507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:45:44,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=22510, skipped=0, lr=[8.743069358575772e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:45:44,254] [INFO] [timer.py:259:stop] epoch=0/micro_step=22510/global_step=22510, RunningAvgSamplesPerSec=2.6380168691912753, CurrSamplesPerSec=2.63706908674001, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:45:59,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=22520, skipped=0, lr=[8.741952072434892e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:45:59,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=22520/global_step=22520, RunningAvgSamplesPerSec=2.6380173607543793, CurrSamplesPerSec=2.640487322682663, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:46:14,666] [INFO] [logging.py:96:log_dist] [Rank 0] step=22530, skipped=0, lr=[8.740834361394148e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:46:14,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=22530/global_step=22530, RunningAvgSamplesPerSec=2.6380199053561277, CurrSamplesPerSec=2.6409919258253494, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:46:29,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=22540, skipped=0, lr=[8.739716225580455e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:46:29,924] [INFO] [timer.py:259:stop] epoch=0/micro_step=22540/global_step=22540, RunningAvgSamplesPerSec=2.638019465458849, CurrSamplesPerSec=2.6030223179063436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:46:45,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=22550, skipped=0, lr=[8.738597665120777e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:46:45,145] [INFO] [timer.py:259:stop] epoch=0/micro_step=22550/global_step=22550, RunningAvgSamplesPerSec=2.63802067048048, CurrSamplesPerSec=2.638581215391476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:47:00,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=22560, skipped=0, lr=[8.737478680142126e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:47:00,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=22560/global_step=22560, RunningAvgSamplesPerSec=2.63802116504143, CurrSamplesPerSec=2.631983068519186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:47:15,621] [INFO] [logging.py:96:log_dist] [Rank 0] step=22570, skipped=0, lr=[8.736359270771567e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:47:15,622] [INFO] [timer.py:259:stop] epoch=0/micro_step=22570/global_step=22570, RunningAvgSamplesPerSec=2.638021876984794, CurrSamplesPerSec=2.6506253757147076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:47:30,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=22580, skipped=0, lr=[8.735239437136207e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:47:30,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=22580/global_step=22580, RunningAvgSamplesPerSec=2.6380226353296927, CurrSamplesPerSec=2.6392910120647572, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:47:46,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=22590, skipped=0, lr=[8.734119179363203e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:47:46,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=22590/global_step=22590, RunningAvgSamplesPerSec=2.638021477049241, CurrSamplesPerSec=2.6365311772386297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:48:01,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=22600, skipped=0, lr=[8.73299849757976e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:48:01,389] [INFO] [timer.py:259:stop] epoch=0/micro_step=22600/global_step=22600, RunningAvgSamplesPerSec=2.6380195678251024, CurrSamplesPerSec=2.629346859181123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:48:16,569] [INFO] [logging.py:96:log_dist] [Rank 0] step=22610, skipped=0, lr=[8.731877391913135e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:48:16,589] [INFO] [timer.py:259:stop] epoch=0/micro_step=22610/global_step=22610, RunningAvgSamplesPerSec=2.6380226679730048, CurrSamplesPerSec=2.655715244243503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:48:31,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=22620, skipped=0, lr=[8.730755862490626e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:48:31,852] [INFO] [timer.py:259:stop] epoch=0/micro_step=22620/global_step=22620, RunningAvgSamplesPerSec=2.6380199876602513, CurrSamplesPerSec=2.6450703166369203, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:48:47,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=22630, skipped=0, lr=[8.729633909439586e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:48:47,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=22630/global_step=22630, RunningAvgSamplesPerSec=2.6380196727715974, CurrSamplesPerSec=2.626420680856173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:49:02,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=22640, skipped=0, lr=[8.728511532887412e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:49:02,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=22640/global_step=22640, RunningAvgSamplesPerSec=2.6380209767663616, CurrSamplesPerSec=2.629775898856896, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:49:17,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=22650, skipped=0, lr=[8.727388732961547e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:49:17,541] [INFO] [timer.py:259:stop] epoch=0/micro_step=22650/global_step=22650, RunningAvgSamplesPerSec=2.6380223746291573, CurrSamplesPerSec=2.637875118405741, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:49:32,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=22660, skipped=0, lr=[8.726265509789494e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:49:32,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=22660/global_step=22660, RunningAvgSamplesPerSec=2.6380219091147605, CurrSamplesPerSec=2.6365983003373934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:49:48,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=22670, skipped=0, lr=[8.725141863498784e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:49:48,089] [INFO] [timer.py:259:stop] epoch=0/micro_step=22670/global_step=22670, RunningAvgSamplesPerSec=2.638022349017257, CurrSamplesPerSec=2.6359798199095352, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:50:03,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=22680, skipped=0, lr=[8.724017794217017e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:50:03,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=22680/global_step=22680, RunningAvgSamplesPerSec=2.6380236046123287, CurrSamplesPerSec=2.6527938705154597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:50:18,564] [INFO] [logging.py:96:log_dist] [Rank 0] step=22690, skipped=0, lr=[8.722893302071829e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:50:18,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=22690/global_step=22690, RunningAvgSamplesPerSec=2.6380236810346323, CurrSamplesPerSec=2.5999956444875822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:50:33,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=22700, skipped=0, lr=[8.721768387190902e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:50:33,859] [INFO] [timer.py:259:stop] epoch=0/micro_step=22700/global_step=22700, RunningAvgSamplesPerSec=2.6380208176172832, CurrSamplesPerSec=2.626356952921601, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:50:49,055] [INFO] [logging.py:96:log_dist] [Rank 0] step=22710, skipped=0, lr=[8.720643049701976e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:50:49,056] [INFO] [timer.py:259:stop] epoch=0/micro_step=22710/global_step=22710, RunningAvgSamplesPerSec=2.6380237799186084, CurrSamplesPerSec=2.593056887811997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:51:04,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=22720, skipped=0, lr=[8.719517289732829e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:51:04,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=22720/global_step=22720, RunningAvgSamplesPerSec=2.6380234117486165, CurrSamplesPerSec=2.638201983538033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:51:19,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=22730, skipped=0, lr=[8.718391107411294e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:51:19,593] [INFO] [timer.py:259:stop] epoch=0/micro_step=22730/global_step=22730, RunningAvgSamplesPerSec=2.638017889116792, CurrSamplesPerSec=2.634191464677066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:51:34,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=22740, skipped=0, lr=[8.717264502865248e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:51:34,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=22740/global_step=22740, RunningAvgSamplesPerSec=2.638016578484791, CurrSamplesPerSec=2.650130061959698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:51:50,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=22750, skipped=0, lr=[8.71613747622262e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:51:50,112] [INFO] [timer.py:259:stop] epoch=0/micro_step=22750/global_step=22750, RunningAvgSamplesPerSec=2.6380168969382276, CurrSamplesPerSec=2.633746098685577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:52:05,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=22760, skipped=0, lr=[8.715010027611381e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:52:05,360] [INFO] [timer.py:259:stop] epoch=0/micro_step=22760/global_step=22760, RunningAvgSamplesPerSec=2.6380155562721876, CurrSamplesPerSec=2.634806623358396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:52:20,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=22770, skipped=0, lr=[8.713882157159554e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:52:20,791] [INFO] [timer.py:259:stop] epoch=0/micro_step=22770/global_step=22770, RunningAvgSamplesPerSec=2.6380015145260773, CurrSamplesPerSec=2.5626107870941013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:52:36,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=22780, skipped=0, lr=[8.712753864995211e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:52:36,472] [INFO] [timer.py:259:stop] epoch=0/micro_step=22780/global_step=22780, RunningAvgSamplesPerSec=2.6379694269972416, CurrSamplesPerSec=2.566534137481083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:52:51,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=22790, skipped=0, lr=[8.711625151246467e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:52:51,956] [INFO] [timer.py:259:stop] epoch=0/micro_step=22790/global_step=22790, RunningAvgSamplesPerSec=2.637954035626368, CurrSamplesPerSec=2.619445867307136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:53:07,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=22800, skipped=0, lr=[8.71049601604149e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:53:07,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=22800/global_step=22800, RunningAvgSamplesPerSec=2.637952501113658, CurrSamplesPerSec=2.637666929296556, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:53:22,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=22810, skipped=0, lr=[8.709366459508493e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:53:22,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=22810/global_step=22810, RunningAvgSamplesPerSec=2.637956856477537, CurrSamplesPerSec=2.6579068161210704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:53:37,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=22820, skipped=0, lr=[8.708236481775738e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:53:37,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=22820/global_step=22820, RunningAvgSamplesPerSec=2.637950731625374, CurrSamplesPerSec=2.625704228056498, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:53:53,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=22830, skipped=0, lr=[8.707106082971532e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:53:53,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=22830/global_step=22830, RunningAvgSamplesPerSec=2.6379357669271353, CurrSamplesPerSec=2.564526272139301, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:54:08,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=22840, skipped=0, lr=[8.705975263224236e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:54:08,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=22840/global_step=22840, RunningAvgSamplesPerSec=2.6379022673759094, CurrSamplesPerSec=2.545573480339296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:54:24,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=22850, skipped=0, lr=[8.704844022662252e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:54:24,277] [INFO] [timer.py:259:stop] epoch=0/micro_step=22850/global_step=22850, RunningAvgSamplesPerSec=2.6378980949315385, CurrSamplesPerSec=2.61488106346973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:54:39,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=22860, skipped=0, lr=[8.703712361414034e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:54:39,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=22860/global_step=22860, RunningAvgSamplesPerSec=2.6378865551721233, CurrSamplesPerSec=2.5887153238193714, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:54:54,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=22870, skipped=0, lr=[8.702580279608082e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:54:54,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=22870/global_step=22870, RunningAvgSamplesPerSec=2.637880876927376, CurrSamplesPerSec=2.6135616657062046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:55:10,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=22880, skipped=0, lr=[8.701447777372944e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:55:10,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=22880/global_step=22880, RunningAvgSamplesPerSec=2.6378701025091797, CurrSamplesPerSec=2.6487936808252965, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:55:25,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=22890, skipped=0, lr=[8.700314854837216e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:55:25,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=22890/global_step=22890, RunningAvgSamplesPerSec=2.6378667538648948, CurrSamplesPerSec=2.640432052509172, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:55:40,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=22900, skipped=0, lr=[8.699181512129545e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:55:40,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=22900/global_step=22900, RunningAvgSamplesPerSec=2.6378592094354016, CurrSamplesPerSec=2.6315743596746057, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:55:56,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=22910, skipped=0, lr=[8.698047749378616e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:55:56,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=22910/global_step=22910, RunningAvgSamplesPerSec=2.6378585109217694, CurrSamplesPerSec=2.6280387612899196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:56:11,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=22920, skipped=0, lr=[8.696913566713172e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:56:11,553] [INFO] [timer.py:259:stop] epoch=0/micro_step=22920/global_step=22920, RunningAvgSamplesPerSec=2.637852293576811, CurrSamplesPerSec=2.6322849340119587, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:56:26,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=22930, skipped=0, lr=[8.695778964262001e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:56:26,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=22930/global_step=22930, RunningAvgSamplesPerSec=2.6378468646277944, CurrSamplesPerSec=2.633815560904158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:56:42,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=22940, skipped=0, lr=[8.694643942153935e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:56:42,123] [INFO] [timer.py:259:stop] epoch=0/micro_step=22940/global_step=22940, RunningAvgSamplesPerSec=2.6378432261675666, CurrSamplesPerSec=2.620693841657688, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:56:57,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=22950, skipped=0, lr=[8.693508500517858e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:56:57,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=22950/global_step=22950, RunningAvgSamplesPerSec=2.6378384964738673, CurrSamplesPerSec=2.63906557911448, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:57:12,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=22960, skipped=0, lr=[8.692372639482697e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:57:12,662] [INFO] [timer.py:259:stop] epoch=0/micro_step=22960/global_step=22960, RunningAvgSamplesPerSec=2.637838588623286, CurrSamplesPerSec=2.657577155176329, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:57:27,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=22970, skipped=0, lr=[8.691236359177432e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:57:27,835] [INFO] [timer.py:259:stop] epoch=0/micro_step=22970/global_step=22970, RunningAvgSamplesPerSec=2.6378439944138345, CurrSamplesPerSec=2.65307074081787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:57:43,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=22980, skipped=0, lr=[8.690099659731088e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:57:43,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=22980/global_step=22980, RunningAvgSamplesPerSec=2.6378478338878617, CurrSamplesPerSec=2.655882566437591, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:57:58,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=22990, skipped=0, lr=[8.688962541272736e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:57:58,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=22990/global_step=22990, RunningAvgSamplesPerSec=2.6378512140986032, CurrSamplesPerSec=2.6252937695092906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:58:13,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=23000, skipped=0, lr=[8.687825003931498e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:58:13,411] [INFO] [timer.py:259:stop] epoch=0/micro_step=23000/global_step=23000, RunningAvgSamplesPerSec=2.6378547452263743, CurrSamplesPerSec=2.654664287334265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:58:28,724] [INFO] [logging.py:96:log_dist] [Rank 0] step=23010, skipped=0, lr=[8.68668704783654e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:58:28,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=23010/global_step=23010, RunningAvgSamplesPerSec=2.637848613612412, CurrSamplesPerSec=2.5515005030443247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:58:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=23020, skipped=0, lr=[8.685548673117076e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:58:44,047] [INFO] [timer.py:259:stop] epoch=0/micro_step=23020/global_step=23020, RunningAvgSamplesPerSec=2.637844816563099, CurrSamplesPerSec=2.6190635295608313, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:58:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=23030, skipped=0, lr=[8.684409879902373e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:58:59,647] [INFO] [timer.py:259:stop] epoch=0/micro_step=23030/global_step=23030, RunningAvgSamplesPerSec=2.6378182853298813, CurrSamplesPerSec=2.5464474433651647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:59:15,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=23040, skipped=0, lr=[8.68327066832174e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:59:15,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=23040/global_step=23040, RunningAvgSamplesPerSec=2.6377997290349224, CurrSamplesPerSec=2.6225897620769856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:59:30,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=23050, skipped=0, lr=[8.682131038504533e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:59:30,553] [INFO] [timer.py:259:stop] epoch=0/micro_step=23050/global_step=23050, RunningAvgSamplesPerSec=2.6377904367030767, CurrSamplesPerSec=2.6456225634226844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 09:59:45,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=23060, skipped=0, lr=[8.680990990580158e-06], mom=[(0.9, 0.95)] +[2024-11-01 09:59:45,863] [INFO] [timer.py:259:stop] epoch=0/micro_step=23060/global_step=23060, RunningAvgSamplesPerSec=2.6377869681753694, CurrSamplesPerSec=2.6366969194841405, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:00:01,215] [INFO] [logging.py:96:log_dist] [Rank 0] step=23070, skipped=0, lr=[8.679850524678069e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:00:01,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=23070/global_step=23070, RunningAvgSamplesPerSec=2.6377788193269502, CurrSamplesPerSec=2.627375325303704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:00:16,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=23080, skipped=0, lr=[8.678709640927764e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:00:16,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=23080/global_step=23080, RunningAvgSamplesPerSec=2.6377757644973565, CurrSamplesPerSec=2.626536221320291, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:00:31,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=23090, skipped=0, lr=[8.677568339458793e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:00:31,811] [INFO] [timer.py:259:stop] epoch=0/micro_step=23090/global_step=23090, RunningAvgSamplesPerSec=2.637770612013042, CurrSamplesPerSec=2.6062872113656663, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:00:47,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=23100, skipped=0, lr=[8.676426620400749e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:00:47,044] [INFO] [timer.py:259:stop] epoch=0/micro_step=23100/global_step=23100, RunningAvgSamplesPerSec=2.637771831263931, CurrSamplesPerSec=2.644467029571952, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:01:02,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=23110, skipped=0, lr=[8.675284483883278e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:01:02,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=23110/global_step=23110, RunningAvgSamplesPerSec=2.637771013730595, CurrSamplesPerSec=2.633412069671353, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:01:17,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=23120, skipped=0, lr=[8.674141930036067e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:01:17,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=23120/global_step=23120, RunningAvgSamplesPerSec=2.637775930043737, CurrSamplesPerSec=2.6530485051594486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:01:32,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=23130, skipped=0, lr=[8.672998958988852e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:01:32,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=23130/global_step=23130, RunningAvgSamplesPerSec=2.637774399309674, CurrSamplesPerSec=2.6455407963360025, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:01:47,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=23140, skipped=0, lr=[8.671855570871422e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:01:47,955] [INFO] [timer.py:259:stop] epoch=0/micro_step=23140/global_step=23140, RunningAvgSamplesPerSec=2.637776425333726, CurrSamplesPerSec=2.6610901626325276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:02:03,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=23150, skipped=0, lr=[8.670711765813606e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:02:03,184] [INFO] [timer.py:259:stop] epoch=0/micro_step=23150/global_step=23150, RunningAvgSamplesPerSec=2.6377763972700037, CurrSamplesPerSec=2.6396448076989816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:02:18,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=23160, skipped=0, lr=[8.669567543945286e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:02:18,446] [INFO] [timer.py:259:stop] epoch=0/micro_step=23160/global_step=23160, RunningAvgSamplesPerSec=2.6377742393949806, CurrSamplesPerSec=2.622843551489822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:02:33,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=23170, skipped=0, lr=[8.668422905396385e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:02:33,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=23170/global_step=23170, RunningAvgSamplesPerSec=2.6377780142239255, CurrSamplesPerSec=2.6337150899494888, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:02:48,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=23180, skipped=0, lr=[8.667277850296882e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:02:48,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=23180/global_step=23180, RunningAvgSamplesPerSec=2.6377778087649673, CurrSamplesPerSec=2.6069258611219346, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:03:04,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=23190, skipped=0, lr=[8.666132378776792e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:03:04,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=23190/global_step=23190, RunningAvgSamplesPerSec=2.637777426885057, CurrSamplesPerSec=2.638024437604424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:03:19,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=23200, skipped=0, lr=[8.66498649096619e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:03:19,382] [INFO] [timer.py:259:stop] epoch=0/micro_step=23200/global_step=23200, RunningAvgSamplesPerSec=2.6377757847241305, CurrSamplesPerSec=2.6092571361357644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:03:34,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=23210, skipped=0, lr=[8.663840186995189e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:03:34,546] [INFO] [timer.py:259:stop] epoch=0/micro_step=23210/global_step=23210, RunningAvgSamplesPerSec=2.6377811862748093, CurrSamplesPerSec=2.6389987454389083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:03:49,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=23220, skipped=0, lr=[8.662693466993952e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:03:49,816] [INFO] [timer.py:259:stop] epoch=0/micro_step=23220/global_step=23220, RunningAvgSamplesPerSec=2.6377790100425353, CurrSamplesPerSec=2.5762738163557724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:04:05,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=23230, skipped=0, lr=[8.66154633109269e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:04:05,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=23230/global_step=23230, RunningAvgSamplesPerSec=2.6377821739926266, CurrSamplesPerSec=2.6350851319296407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:04:20,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=23240, skipped=0, lr=[8.66039877942166e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:04:20,245] [INFO] [timer.py:259:stop] epoch=0/micro_step=23240/global_step=23240, RunningAvgSamplesPerSec=2.6377824132281087, CurrSamplesPerSec=2.6524990252438445, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:04:35,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=23250, skipped=0, lr=[8.659250812111166e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:04:35,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=23250/global_step=23250, RunningAvgSamplesPerSec=2.6377833998226405, CurrSamplesPerSec=2.6368800889023856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:04:50,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=23260, skipped=0, lr=[8.658102429291564e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:04:50,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=23260/global_step=23260, RunningAvgSamplesPerSec=2.6377819765625183, CurrSamplesPerSec=2.646575773054545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:05:05,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=23270, skipped=0, lr=[8.65695363109325e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:05:05,973] [INFO] [timer.py:259:stop] epoch=0/micro_step=23270/global_step=23270, RunningAvgSamplesPerSec=2.637783357435153, CurrSamplesPerSec=2.6464313285031063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:05:21,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=23280, skipped=0, lr=[8.65580441764667e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:05:21,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=23280/global_step=23280, RunningAvgSamplesPerSec=2.6377816166686254, CurrSamplesPerSec=2.628515967323414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:05:36,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=23290, skipped=0, lr=[8.65465478908232e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:05:36,539] [INFO] [timer.py:259:stop] epoch=0/micro_step=23290/global_step=23290, RunningAvgSamplesPerSec=2.637777480728665, CurrSamplesPerSec=2.6522193393207094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:05:51,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=23300, skipped=0, lr=[8.653504745530738e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:05:51,812] [INFO] [timer.py:259:stop] epoch=0/micro_step=23300/global_step=23300, RunningAvgSamplesPerSec=2.63777608237808, CurrSamplesPerSec=2.6621673357406155, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:06:07,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=23310, skipped=0, lr=[8.652354287122513e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:06:07,056] [INFO] [timer.py:259:stop] epoch=0/micro_step=23310/global_step=23310, RunningAvgSamplesPerSec=2.6377754706856558, CurrSamplesPerSec=2.641846117105916, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:06:22,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=23320, skipped=0, lr=[8.65120341398828e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:06:22,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=23320/global_step=23320, RunningAvgSamplesPerSec=2.637774939321374, CurrSamplesPerSec=2.6416140085342126, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:06:37,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=23330, skipped=0, lr=[8.650052126258722e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:06:37,512] [INFO] [timer.py:259:stop] epoch=0/micro_step=23330/global_step=23330, RunningAvgSamplesPerSec=2.6377762117580845, CurrSamplesPerSec=2.6372643300581067, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:06:52,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=23340, skipped=0, lr=[8.648900424064568e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:06:52,718] [INFO] [timer.py:259:stop] epoch=0/micro_step=23340/global_step=23340, RunningAvgSamplesPerSec=2.6377806047681185, CurrSamplesPerSec=2.6688822195145345, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:07:07,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=23350, skipped=0, lr=[8.647748307536592e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:07:07,941] [INFO] [timer.py:259:stop] epoch=0/micro_step=23350/global_step=23350, RunningAvgSamplesPerSec=2.6377820962777854, CurrSamplesPerSec=2.6427050238334755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:07:23,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=23360, skipped=0, lr=[8.646595776805618e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:07:23,251] [INFO] [timer.py:259:stop] epoch=0/micro_step=23360/global_step=23360, RunningAvgSamplesPerSec=2.6377779596000464, CurrSamplesPerSec=2.6367839425051645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:07:38,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=23370, skipped=0, lr=[8.645442832002517e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:07:38,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=23370/global_step=23370, RunningAvgSamplesPerSec=2.637779033986296, CurrSamplesPerSec=2.650812160831557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:07:53,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=23380, skipped=0, lr=[8.644289473258207e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:07:53,732] [INFO] [timer.py:259:stop] epoch=0/micro_step=23380/global_step=23380, RunningAvgSamplesPerSec=2.6377787174668534, CurrSamplesPerSec=2.644419095272386, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:08:08,957] [INFO] [logging.py:96:log_dist] [Rank 0] step=23390, skipped=0, lr=[8.64313570070365e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:08:08,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=23390/global_step=23390, RunningAvgSamplesPerSec=2.63777785964015, CurrSamplesPerSec=2.6384691771304083, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:08:24,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=23400, skipped=0, lr=[8.64198151446986e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:08:24,281] [INFO] [timer.py:259:stop] epoch=0/micro_step=23400/global_step=23400, RunningAvgSamplesPerSec=2.6377735862860563, CurrSamplesPerSec=2.6396967223145156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:08:39,491] [INFO] [logging.py:96:log_dist] [Rank 0] step=23410, skipped=0, lr=[8.640826914687893e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:08:39,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=23410/global_step=23410, RunningAvgSamplesPerSec=2.6377751299193015, CurrSamplesPerSec=2.636512532539944, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:08:54,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=23420, skipped=0, lr=[8.639671901488853e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:08:54,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=23420/global_step=23420, RunningAvgSamplesPerSec=2.6377815903332804, CurrSamplesPerSec=2.642972714017836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:09:09,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=23430, skipped=0, lr=[8.638516475003896e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:09:09,913] [INFO] [timer.py:259:stop] epoch=0/micro_step=23430/global_step=23430, RunningAvgSamplesPerSec=2.6377811190361187, CurrSamplesPerSec=2.629156906505156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:09:25,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=23440, skipped=0, lr=[8.637360635364218e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:09:25,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=23440/global_step=23440, RunningAvgSamplesPerSec=2.637780646831901, CurrSamplesPerSec=2.6478363658324446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:09:40,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=23450, skipped=0, lr=[8.636204382701068e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:09:40,361] [INFO] [timer.py:259:stop] epoch=0/micro_step=23450/global_step=23450, RunningAvgSamplesPerSec=2.637782366915777, CurrSamplesPerSec=2.620772033060561, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:09:55,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=23460, skipped=0, lr=[8.635047717145735e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:09:55,572] [INFO] [timer.py:259:stop] epoch=0/micro_step=23460/global_step=23460, RunningAvgSamplesPerSec=2.6377862383581, CurrSamplesPerSec=2.6450523849949468, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:10:10,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=23470, skipped=0, lr=[8.63389063882956e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:10:10,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=23470/global_step=23470, RunningAvgSamplesPerSec=2.637782705078396, CurrSamplesPerSec=2.6194192840673307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:10:26,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=23480, skipped=0, lr=[8.632733147883934e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:10:26,133] [INFO] [timer.py:259:stop] epoch=0/micro_step=23480/global_step=23480, RunningAvgSamplesPerSec=2.637781120062664, CurrSamplesPerSec=2.641092952751077, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:10:41,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=23490, skipped=0, lr=[8.631575244440284e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:10:41,415] [INFO] [timer.py:259:stop] epoch=0/micro_step=23490/global_step=23490, RunningAvgSamplesPerSec=2.637778324191714, CurrSamplesPerSec=2.6571958104519293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:10:56,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=23500, skipped=0, lr=[8.630416928630093e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:10:56,716] [INFO] [timer.py:259:stop] epoch=0/micro_step=23500/global_step=23500, RunningAvgSamplesPerSec=2.6377744490229964, CurrSamplesPerSec=2.6439627644094212, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:11:11,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=23510, skipped=0, lr=[8.629258200584889e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:11:12,002] [INFO] [timer.py:259:stop] epoch=0/micro_step=23510/global_step=23510, RunningAvgSamplesPerSec=2.6377725868268507, CurrSamplesPerSec=2.637031782389308, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:11:27,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=23520, skipped=0, lr=[8.628099060436243e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:11:27,221] [INFO] [timer.py:259:stop] epoch=0/micro_step=23520/global_step=23520, RunningAvgSamplesPerSec=2.6377739487859033, CurrSamplesPerSec=2.6326686632755285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:11:42,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=23530, skipped=0, lr=[8.62693950831578e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:11:42,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=23530/global_step=23530, RunningAvgSamplesPerSec=2.637772355288705, CurrSamplesPerSec=2.643559907265566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:11:57,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=23540, skipped=0, lr=[8.625779544355163e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:11:57,715] [INFO] [timer.py:259:stop] epoch=0/micro_step=23540/global_step=23540, RunningAvgSamplesPerSec=2.637773802293565, CurrSamplesPerSec=2.638491169075559, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:12:12,972] [INFO] [logging.py:96:log_dist] [Rank 0] step=23550, skipped=0, lr=[8.624619168686112e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:12:12,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=23550/global_step=23550, RunningAvgSamplesPerSec=2.6377730516630162, CurrSamplesPerSec=2.6553739387108606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:12:28,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=23560, skipped=0, lr=[8.623458381440385e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:12:28,210] [INFO] [timer.py:259:stop] epoch=0/micro_step=23560/global_step=23560, RunningAvgSamplesPerSec=2.6377738329371723, CurrSamplesPerSec=2.604392139694992, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:12:43,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=23570, skipped=0, lr=[8.622297182749787e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:12:43,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=23570/global_step=23570, RunningAvgSamplesPerSec=2.637771529579208, CurrSamplesPerSec=2.6384044482495908, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:12:58,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=23580, skipped=0, lr=[8.621135572746174e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:12:58,746] [INFO] [timer.py:259:stop] epoch=0/micro_step=23580/global_step=23580, RunningAvgSamplesPerSec=2.6377705728761147, CurrSamplesPerSec=2.617188624769793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:13:13,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=23590, skipped=0, lr=[8.61997355156145e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:13:13,967] [INFO] [timer.py:259:stop] epoch=0/micro_step=23590/global_step=23590, RunningAvgSamplesPerSec=2.637772828510915, CurrSamplesPerSec=2.644423680217185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:13:29,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=23600, skipped=0, lr=[8.61881111932756e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:13:29,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=23600/global_step=23600, RunningAvgSamplesPerSec=2.637775583723361, CurrSamplesPerSec=2.6427870320160958, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:13:44,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=23610, skipped=0, lr=[8.617648276176502e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:13:44,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=23610/global_step=23610, RunningAvgSamplesPerSec=2.637775055079088, CurrSamplesPerSec=2.631429484289535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:13:59,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=23620, skipped=0, lr=[8.616485022240312e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:13:59,714] [INFO] [timer.py:259:stop] epoch=0/micro_step=23620/global_step=23620, RunningAvgSamplesPerSec=2.637775066451338, CurrSamplesPerSec=2.5966190827873197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:14:14,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=23630, skipped=0, lr=[8.615321357651081e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:14:14,988] [INFO] [timer.py:259:stop] epoch=0/micro_step=23630/global_step=23630, RunningAvgSamplesPerSec=2.6377743210034663, CurrSamplesPerSec=2.6473329036535356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:14:30,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=23640, skipped=0, lr=[8.614157282540945e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:14:30,254] [INFO] [timer.py:259:stop] epoch=0/micro_step=23640/global_step=23640, RunningAvgSamplesPerSec=2.6377731618350935, CurrSamplesPerSec=2.6368117082015528, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:14:45,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=23650, skipped=0, lr=[8.612992797042081e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:14:45,426] [INFO] [timer.py:259:stop] epoch=0/micro_step=23650/global_step=23650, RunningAvgSamplesPerSec=2.637778806765493, CurrSamplesPerSec=2.63301531363422, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:15:00,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=23660, skipped=0, lr=[8.61182790128672e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:15:00,651] [INFO] [timer.py:259:stop] epoch=0/micro_step=23660/global_step=23660, RunningAvgSamplesPerSec=2.637780361581808, CurrSamplesPerSec=2.6474068441930685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:15:15,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=23670, skipped=0, lr=[8.610662595407136e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:15:15,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=23670/global_step=23670, RunningAvgSamplesPerSec=2.6377832346719403, CurrSamplesPerSec=2.649609407575081, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:15:31,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=23680, skipped=0, lr=[8.609496879535648e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:15:31,113] [INFO] [timer.py:259:stop] epoch=0/micro_step=23680/global_step=23680, RunningAvgSamplesPerSec=2.63778231778004, CurrSamplesPerSec=2.637642048334653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:15:46,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=23690, skipped=0, lr=[8.608330753804624e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:15:46,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=23690/global_step=23690, RunningAvgSamplesPerSec=2.6377843083484502, CurrSamplesPerSec=2.658065570932101, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:16:01,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=23700, skipped=0, lr=[8.60716421834648e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:16:01,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=23700/global_step=23700, RunningAvgSamplesPerSec=2.637788558256706, CurrSamplesPerSec=2.6536015704707623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:16:16,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=23710, skipped=0, lr=[8.605997273293673e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:16:16,737] [INFO] [timer.py:259:stop] epoch=0/micro_step=23710/global_step=23710, RunningAvgSamplesPerSec=2.637788886619281, CurrSamplesPerSec=2.6453243048371755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:16:31,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=23720, skipped=0, lr=[8.604829918778715e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:16:31,986] [INFO] [timer.py:259:stop] epoch=0/micro_step=23720/global_step=23720, RunningAvgSamplesPerSec=2.637788472421186, CurrSamplesPerSec=2.6502665373816914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:16:47,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=23730, skipped=0, lr=[8.603662154934152e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:16:47,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=23730/global_step=23730, RunningAvgSamplesPerSec=2.6377909151735697, CurrSamplesPerSec=2.660081763228568, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:17:02,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=23740, skipped=0, lr=[8.602493981892592e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:17:02,437] [INFO] [timer.py:259:stop] epoch=0/micro_step=23740/global_step=23740, RunningAvgSamplesPerSec=2.6377918312148907, CurrSamplesPerSec=2.629255793844214, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:17:17,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=23750, skipped=0, lr=[8.601325399786675e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:17:17,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=23750/global_step=23750, RunningAvgSamplesPerSec=2.6377925880029895, CurrSamplesPerSec=2.658885333792711, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:17:32,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=23760, skipped=0, lr=[8.600156408749099e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:17:32,950] [INFO] [timer.py:259:stop] epoch=0/micro_step=23760/global_step=23760, RunningAvgSamplesPerSec=2.6377927745386094, CurrSamplesPerSec=2.6528819593432007, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:17:48,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=23770, skipped=0, lr=[8.5989870089126e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:17:48,178] [INFO] [timer.py:259:stop] epoch=0/micro_step=23770/global_step=23770, RunningAvgSamplesPerSec=2.6377936469397207, CurrSamplesPerSec=2.6413640590763072, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:18:03,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=23780, skipped=0, lr=[8.597817200409966e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:18:03,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=23780/global_step=23780, RunningAvgSamplesPerSec=2.6377952882365627, CurrSamplesPerSec=2.630131683446843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:18:18,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=23790, skipped=0, lr=[8.596646983374026e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:18:18,623] [INFO] [timer.py:259:stop] epoch=0/micro_step=23790/global_step=23790, RunningAvgSamplesPerSec=2.637796678461271, CurrSamplesPerSec=2.6270967985195286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:18:33,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=23800, skipped=0, lr=[8.59547635793766e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:18:33,907] [INFO] [timer.py:259:stop] epoch=0/micro_step=23800/global_step=23800, RunningAvgSamplesPerSec=2.637795191735421, CurrSamplesPerSec=2.6480244298665703, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:18:49,205] [INFO] [logging.py:96:log_dist] [Rank 0] step=23810, skipped=0, lr=[8.594305324233794e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:18:49,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=23810/global_step=23810, RunningAvgSamplesPerSec=2.637791801024712, CurrSamplesPerSec=2.611408846486063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:19:04,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=23820, skipped=0, lr=[8.5931338823954e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:19:04,466] [INFO] [timer.py:259:stop] epoch=0/micro_step=23820/global_step=23820, RunningAvgSamplesPerSec=2.6377950479651595, CurrSamplesPerSec=2.657317862527106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:19:19,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=23830, skipped=0, lr=[8.591962032555492e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:19:19,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=23830/global_step=23830, RunningAvgSamplesPerSec=2.6377908486368575, CurrSamplesPerSec=2.602560378689173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:19:35,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=23840, skipped=0, lr=[8.590789774847137e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:19:35,057] [INFO] [timer.py:259:stop] epoch=0/micro_step=23840/global_step=23840, RunningAvgSamplesPerSec=2.6377933882257283, CurrSamplesPerSec=2.6594159636878496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:19:50,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=23850, skipped=0, lr=[8.589617109403446e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:19:50,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=23850/global_step=23850, RunningAvgSamplesPerSec=2.6377916179634107, CurrSamplesPerSec=2.650848599561813, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:20:05,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=23860, skipped=0, lr=[8.58844403635757e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:20:05,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=23860/global_step=23860, RunningAvgSamplesPerSec=2.6377841524899135, CurrSamplesPerSec=2.660783764653641, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:20:21,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=23870, skipped=0, lr=[8.587270555842716e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:20:21,052] [INFO] [timer.py:259:stop] epoch=0/micro_step=23870/global_step=23870, RunningAvgSamplesPerSec=2.637780877003545, CurrSamplesPerSec=2.6394035353666934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:20:36,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=23880, skipped=0, lr=[8.586096667992137e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:20:36,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=23880/global_step=23880, RunningAvgSamplesPerSec=2.6377788092679784, CurrSamplesPerSec=2.6273197797980163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:20:51,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=23890, skipped=0, lr=[8.584922372939122e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:20:51,709] [INFO] [timer.py:259:stop] epoch=0/micro_step=23890/global_step=23890, RunningAvgSamplesPerSec=2.6377737628613183, CurrSamplesPerSec=2.631233865921048, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:21:07,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=23900, skipped=0, lr=[8.583747670817013e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:21:07,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=23900/global_step=23900, RunningAvgSamplesPerSec=2.6377680108618566, CurrSamplesPerSec=2.6369915777708393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:21:22,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=23910, skipped=0, lr=[8.582572561759201e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:21:22,390] [INFO] [timer.py:259:stop] epoch=0/micro_step=23910/global_step=23910, RunningAvgSamplesPerSec=2.6377622616176657, CurrSamplesPerSec=2.6350801654365834, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:21:37,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=23920, skipped=0, lr=[8.581397045899119e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:21:37,732] [INFO] [timer.py:259:stop] epoch=0/micro_step=23920/global_step=23920, RunningAvgSamplesPerSec=2.637756749167453, CurrSamplesPerSec=2.639576698820362, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:21:53,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=23930, skipped=0, lr=[8.580221123370244e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:21:53,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=23930/global_step=23930, RunningAvgSamplesPerSec=2.6377496759487995, CurrSamplesPerSec=2.638771702273889, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:22:08,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=23940, skipped=0, lr=[8.579044794306106e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:22:08,471] [INFO] [timer.py:259:stop] epoch=0/micro_step=23940/global_step=23940, RunningAvgSamplesPerSec=2.6377440623304165, CurrSamplesPerSec=2.6218094316686416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:22:23,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=23950, skipped=0, lr=[8.577868058840277e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:22:23,802] [INFO] [timer.py:259:stop] epoch=0/micro_step=23950/global_step=23950, RunningAvgSamplesPerSec=2.637738442704806, CurrSamplesPerSec=2.636976242311533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:22:39,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=23960, skipped=0, lr=[8.576690917106377e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:22:39,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=23960/global_step=23960, RunningAvgSamplesPerSec=2.637733691621098, CurrSamplesPerSec=2.584985168185514, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:22:54,397] [INFO] [logging.py:96:log_dist] [Rank 0] step=23970, skipped=0, lr=[8.575513369238067e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:22:54,400] [INFO] [timer.py:259:stop] epoch=0/micro_step=23970/global_step=23970, RunningAvgSamplesPerSec=2.6377320098665216, CurrSamplesPerSec=2.647739418810535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:23:09,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=23980, skipped=0, lr=[8.574335415369061e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:23:09,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=23980/global_step=23980, RunningAvgSamplesPerSec=2.6377268604141486, CurrSamplesPerSec=2.594965595653934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:23:25,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=23990, skipped=0, lr=[8.573157055633114e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:23:25,062] [INFO] [timer.py:259:stop] epoch=0/micro_step=23990/global_step=23990, RunningAvgSamplesPerSec=2.6377215438822845, CurrSamplesPerSec=2.6391847258018486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:23:40,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=24000, skipped=0, lr=[8.571978290164031e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:23:40,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=24000/global_step=24000, RunningAvgSamplesPerSec=2.6377176718209987, CurrSamplesPerSec=2.6462560123706775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:23:55,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=24010, skipped=0, lr=[8.570799119095661e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:23:55,685] [INFO] [timer.py:259:stop] epoch=0/micro_step=24010/global_step=24010, RunningAvgSamplesPerSec=2.6377137705398686, CurrSamplesPerSec=2.6265571923877054, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:24:10,939] [INFO] [logging.py:96:log_dist] [Rank 0] step=24020, skipped=0, lr=[8.569619542561898e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:24:10,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=24020/global_step=24020, RunningAvgSamplesPerSec=2.6377138293212625, CurrSamplesPerSec=2.616973074631935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:24:26,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=24030, skipped=0, lr=[8.568439560696684e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:24:26,239] [INFO] [timer.py:259:stop] epoch=0/micro_step=24030/global_step=24030, RunningAvgSamplesPerSec=2.637709742996766, CurrSamplesPerSec=2.645838269224668, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:24:41,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=24040, skipped=0, lr=[8.567259173634007e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:24:41,562] [INFO] [timer.py:259:stop] epoch=0/micro_step=24040/global_step=24040, RunningAvgSamplesPerSec=2.6377048815975193, CurrSamplesPerSec=2.625999722605479, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:24:56,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=24050, skipped=0, lr=[8.566078381507899e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:24:56,930] [INFO] [timer.py:259:stop] epoch=0/micro_step=24050/global_step=24050, RunningAvgSamplesPerSec=2.6376963212897966, CurrSamplesPerSec=2.470146900966238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:25:12,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=24060, skipped=0, lr=[8.56489718445244e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:25:12,239] [INFO] [timer.py:259:stop] epoch=0/micro_step=24060/global_step=24060, RunningAvgSamplesPerSec=2.6376945005831165, CurrSamplesPerSec=2.641109999208443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:25:27,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=24070, skipped=0, lr=[8.563715582601755e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:25:27,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=24070/global_step=24070, RunningAvgSamplesPerSec=2.6376869549887205, CurrSamplesPerSec=2.6171457568570866, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:25:42,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=24080, skipped=0, lr=[8.562533576090015e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:25:42,947] [INFO] [timer.py:259:stop] epoch=0/micro_step=24080/global_step=24080, RunningAvgSamplesPerSec=2.6376833633141157, CurrSamplesPerSec=2.6283244876589302, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:25:58,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=24090, skipped=0, lr=[8.561351165051439e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:25:58,233] [INFO] [timer.py:259:stop] epoch=0/micro_step=24090/global_step=24090, RunningAvgSamplesPerSec=2.637681095399751, CurrSamplesPerSec=2.609059931414379, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:26:13,564] [INFO] [logging.py:96:log_dist] [Rank 0] step=24100, skipped=0, lr=[8.56016834962029e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:26:13,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=24100/global_step=24100, RunningAvgSamplesPerSec=2.6376759875118263, CurrSamplesPerSec=2.631566929779619, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:26:28,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=24110, skipped=0, lr=[8.558985129930874e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:26:28,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=24110/global_step=24110, RunningAvgSamplesPerSec=2.6376678146132173, CurrSamplesPerSec=2.5841378910877224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:26:44,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=24120, skipped=0, lr=[8.557801506117548e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:26:44,224] [INFO] [timer.py:259:stop] epoch=0/micro_step=24120/global_step=24120, RunningAvgSamplesPerSec=2.6376668788530013, CurrSamplesPerSec=2.6628624082041177, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:26:59,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=24130, skipped=0, lr=[8.556617478314713e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:26:59,526] [INFO] [timer.py:259:stop] epoch=0/micro_step=24130/global_step=24130, RunningAvgSamplesPerSec=2.637664139190904, CurrSamplesPerSec=2.579379114941069, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:27:14,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=24140, skipped=0, lr=[8.555433046656816e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:27:14,872] [INFO] [timer.py:259:stop] epoch=0/micro_step=24140/global_step=24140, RunningAvgSamplesPerSec=2.6376583576073545, CurrSamplesPerSec=2.6469027101329647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:27:30,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=24150, skipped=0, lr=[8.554248211278348e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:27:30,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=24150/global_step=24150, RunningAvgSamplesPerSec=2.637656690778384, CurrSamplesPerSec=2.622242572451127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:27:45,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=24160, skipped=0, lr=[8.55306297231385e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:27:45,512] [INFO] [timer.py:259:stop] epoch=0/micro_step=24160/global_step=24160, RunningAvgSamplesPerSec=2.637653322400063, CurrSamplesPerSec=2.644029433066429, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:28:00,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=24170, skipped=0, lr=[8.551877329897905e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:28:00,863] [INFO] [timer.py:259:stop] epoch=0/micro_step=24170/global_step=24170, RunningAvgSamplesPerSec=2.637648429337266, CurrSamplesPerSec=2.6337022732185122, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:28:16,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=24180, skipped=0, lr=[8.550691284165142e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:28:16,196] [INFO] [timer.py:259:stop] epoch=0/micro_step=24180/global_step=24180, RunningAvgSamplesPerSec=2.6376434312591535, CurrSamplesPerSec=2.635480856041245, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:28:31,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=24190, skipped=0, lr=[8.549504835250239e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:28:31,592] [INFO] [timer.py:259:stop] epoch=0/micro_step=24190/global_step=24190, RunningAvgSamplesPerSec=2.637634498963769, CurrSamplesPerSec=2.6232089468929374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:28:46,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=24200, skipped=0, lr=[8.548317983287915e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:28:46,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=24200/global_step=24200, RunningAvgSamplesPerSec=2.6376335085424314, CurrSamplesPerSec=2.638639313050682, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:29:02,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=24210, skipped=0, lr=[8.54713072841294e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:29:02,181] [INFO] [timer.py:259:stop] epoch=0/micro_step=24210/global_step=24210, RunningAvgSamplesPerSec=2.6376296038451423, CurrSamplesPerSec=2.633228141850845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:29:17,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=24220, skipped=0, lr=[8.545943070760126e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:29:17,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=24220/global_step=24220, RunningAvgSamplesPerSec=2.63762779565031, CurrSamplesPerSec=2.6439160983499144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:29:32,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=24230, skipped=0, lr=[8.544755010464332e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:29:32,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=24230/global_step=24230, RunningAvgSamplesPerSec=2.6376236147966896, CurrSamplesPerSec=2.6184945233035593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:29:48,145] [INFO] [logging.py:96:log_dist] [Rank 0] step=24240, skipped=0, lr=[8.543566547660463e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:29:48,148] [INFO] [timer.py:259:stop] epoch=0/micro_step=24240/global_step=24240, RunningAvgSamplesPerSec=2.6376173568981147, CurrSamplesPerSec=2.6092396867792846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:30:03,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=24250, skipped=0, lr=[8.542377682483467e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:30:03,482] [INFO] [timer.py:259:stop] epoch=0/micro_step=24250/global_step=24250, RunningAvgSamplesPerSec=2.63761286903222, CurrSamplesPerSec=2.624255254167472, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:30:18,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=24260, skipped=0, lr=[8.541188415068343e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:30:18,774] [INFO] [timer.py:259:stop] epoch=0/micro_step=24260/global_step=24260, RunningAvgSamplesPerSec=2.6376106373876795, CurrSamplesPerSec=2.590704051359402, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:30:34,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=24270, skipped=0, lr=[8.539998745550134e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:30:34,083] [INFO] [timer.py:259:stop] epoch=0/micro_step=24270/global_step=24270, RunningAvgSamplesPerSec=2.637607794554601, CurrSamplesPerSec=2.6425505958447184, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:30:49,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=24280, skipped=0, lr=[8.53880867406392e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:30:49,487] [INFO] [timer.py:259:stop] epoch=0/micro_step=24280/global_step=24280, RunningAvgSamplesPerSec=2.6375984517983246, CurrSamplesPerSec=2.577848107450772, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:31:04,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=24290, skipped=0, lr=[8.537618200744844e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:31:04,816] [INFO] [timer.py:259:stop] epoch=0/micro_step=24290/global_step=24290, RunningAvgSamplesPerSec=2.637594922476925, CurrSamplesPerSec=2.6120313023217885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:31:20,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=24300, skipped=0, lr=[8.536427325728077e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:31:20,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=24300/global_step=24300, RunningAvgSamplesPerSec=2.6375926005519785, CurrSamplesPerSec=2.6297713645645024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:31:35,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=24310, skipped=0, lr=[8.535236049148847e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:31:35,439] [INFO] [timer.py:259:stop] epoch=0/micro_step=24310/global_step=24310, RunningAvgSamplesPerSec=2.6375898108136746, CurrSamplesPerSec=2.6518001305800976, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:31:50,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=24320, skipped=0, lr=[8.534044371142423e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:31:50,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=24320/global_step=24320, RunningAvgSamplesPerSec=2.6375830542060057, CurrSamplesPerSec=2.6495537548625103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:32:06,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=24330, skipped=0, lr=[8.532852291844117e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:32:06,095] [INFO] [timer.py:259:stop] epoch=0/micro_step=24330/global_step=24330, RunningAvgSamplesPerSec=2.637579730359682, CurrSamplesPerSec=2.6564326080149736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:32:21,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=24340, skipped=0, lr=[8.531659811389297e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:32:21,427] [INFO] [timer.py:259:stop] epoch=0/micro_step=24340/global_step=24340, RunningAvgSamplesPerSec=2.637575591624798, CurrSamplesPerSec=2.6315297809339966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:32:36,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=24350, skipped=0, lr=[8.530466929913365e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:32:36,737] [INFO] [timer.py:259:stop] epoch=0/micro_step=24350/global_step=24350, RunningAvgSamplesPerSec=2.6375720878246076, CurrSamplesPerSec=2.6492755263640615, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:32:52,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=24360, skipped=0, lr=[8.529273647551773e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:32:52,101] [INFO] [timer.py:259:stop] epoch=0/micro_step=24360/global_step=24360, RunningAvgSamplesPerSec=2.637565795792914, CurrSamplesPerSec=2.6365175044338094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:33:07,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=24370, skipped=0, lr=[8.52807996444002e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:33:07,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=24370/global_step=24370, RunningAvgSamplesPerSec=2.63755989461252, CurrSamplesPerSec=2.644159863349134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:33:22,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=24380, skipped=0, lr=[8.526885880713647e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:33:22,751] [INFO] [timer.py:259:stop] epoch=0/micro_step=24380/global_step=24380, RunningAvgSamplesPerSec=2.63755558243883, CurrSamplesPerSec=2.638581215391476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:33:38,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=24390, skipped=0, lr=[8.525691396508244e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:33:38,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=24390/global_step=24390, RunningAvgSamplesPerSec=2.6375487756980904, CurrSamplesPerSec=2.560090942955597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:33:53,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=24400, skipped=0, lr=[8.524496511959448e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:33:53,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=24400/global_step=24400, RunningAvgSamplesPerSec=2.6375410335518645, CurrSamplesPerSec=2.6472376642993467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:34:08,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=24410, skipped=0, lr=[8.523301227202934e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:34:08,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=24410/global_step=24410, RunningAvgSamplesPerSec=2.63753306663845, CurrSamplesPerSec=2.6287569007268075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:34:24,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=24420, skipped=0, lr=[8.522105542374428e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:34:24,204] [INFO] [timer.py:259:stop] epoch=0/micro_step=24420/global_step=24420, RunningAvgSamplesPerSec=2.6375272800888805, CurrSamplesPerSec=2.561954149016265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:34:39,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=24430, skipped=0, lr=[8.5209094576097e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:34:39,557] [INFO] [timer.py:259:stop] epoch=0/micro_step=24430/global_step=24430, RunningAvgSamplesPerSec=2.6375212107797448, CurrSamplesPerSec=2.5943949751154496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:34:54,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=24440, skipped=0, lr=[8.519712973044569e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:34:54,887] [INFO] [timer.py:259:stop] epoch=0/micro_step=24440/global_step=24440, RunningAvgSamplesPerSec=2.6375168810513046, CurrSamplesPerSec=2.613240877409136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:35:10,234] [INFO] [logging.py:96:log_dist] [Rank 0] step=24450, skipped=0, lr=[8.518516088814895e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:35:10,258] [INFO] [timer.py:259:stop] epoch=0/micro_step=24450/global_step=24450, RunningAvgSamplesPerSec=2.637510361664604, CurrSamplesPerSec=2.6109605856432028, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:35:25,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=24460, skipped=0, lr=[8.517318805056581e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:35:25,554] [INFO] [timer.py:259:stop] epoch=0/micro_step=24460/global_step=24460, RunningAvgSamplesPerSec=2.6375074961252247, CurrSamplesPerSec=2.6428590535516543, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:35:40,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=24470, skipped=0, lr=[8.516121121905584e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:35:40,864] [INFO] [timer.py:259:stop] epoch=0/micro_step=24470/global_step=24470, RunningAvgSamplesPerSec=2.6375050126611432, CurrSamplesPerSec=2.650128806116175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:35:56,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=24480, skipped=0, lr=[8.514923039497897e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:35:56,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=24480/global_step=24480, RunningAvgSamplesPerSec=2.6375020338486173, CurrSamplesPerSec=2.6544971186047674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:36:11,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=24490, skipped=0, lr=[8.513724557969564e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:36:11,518] [INFO] [timer.py:259:stop] epoch=0/micro_step=24490/global_step=24490, RunningAvgSamplesPerSec=2.6374952676995065, CurrSamplesPerSec=2.6331421799051538, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:36:26,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=24500, skipped=0, lr=[8.512525677456675e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:36:26,830] [INFO] [timer.py:259:stop] epoch=0/micro_step=24500/global_step=24500, RunningAvgSamplesPerSec=2.637491277391318, CurrSamplesPerSec=2.518851812069227, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:36:42,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=24510, skipped=0, lr=[8.511326398095362e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:36:42,095] [INFO] [timer.py:259:stop] epoch=0/micro_step=24510/global_step=24510, RunningAvgSamplesPerSec=2.637490798961468, CurrSamplesPerSec=2.642322524984355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:36:57,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=24520, skipped=0, lr=[8.510126720021801e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:36:57,474] [INFO] [timer.py:259:stop] epoch=0/micro_step=24520/global_step=24520, RunningAvgSamplesPerSec=2.637483276254311, CurrSamplesPerSec=2.615443199733755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:37:12,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=24530, skipped=0, lr=[8.508926643372222e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:37:12,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=24530/global_step=24530, RunningAvgSamplesPerSec=2.6374717076554623, CurrSamplesPerSec=2.6071227433574347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:37:28,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=24540, skipped=0, lr=[8.507726168282888e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:37:28,277] [INFO] [timer.py:259:stop] epoch=0/micro_step=24540/global_step=24540, RunningAvgSamplesPerSec=2.637466547407769, CurrSamplesPerSec=2.6354953461040527, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:37:43,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=24550, skipped=0, lr=[8.506525294890115e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:37:43,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=24550/global_step=24550, RunningAvgSamplesPerSec=2.63745775207997, CurrSamplesPerSec=2.6471913002517526, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:37:58,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=24560, skipped=0, lr=[8.505324023330264e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:37:58,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=24560/global_step=24560, RunningAvgSamplesPerSec=2.6374552036038583, CurrSamplesPerSec=2.6001499743243657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:38:14,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=24570, skipped=0, lr=[8.504122353739738e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:38:14,273] [INFO] [timer.py:259:stop] epoch=0/micro_step=24570/global_step=24570, RunningAvgSamplesPerSec=2.6374511524582713, CurrSamplesPerSec=2.6160463687275572, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:38:29,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=24580, skipped=0, lr=[8.50292028625499e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:38:29,534] [INFO] [timer.py:259:stop] epoch=0/micro_step=24580/global_step=24580, RunningAvgSamplesPerSec=2.6374516440989146, CurrSamplesPerSec=2.6101059435264737, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:38:44,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=24590, skipped=0, lr=[8.501717821012512e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:38:44,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=24590/global_step=24590, RunningAvgSamplesPerSec=2.637439066118316, CurrSamplesPerSec=2.6295026326155577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:39:00,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=24600, skipped=0, lr=[8.500514958148844e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:39:00,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=24600/global_step=24600, RunningAvgSamplesPerSec=2.6374369115861054, CurrSamplesPerSec=2.6539252080001945, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:39:15,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=24610, skipped=0, lr=[8.499311697800577e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:39:15,539] [INFO] [timer.py:259:stop] epoch=0/micro_step=24610/global_step=24610, RunningAvgSamplesPerSec=2.637437242229768, CurrSamplesPerSec=2.6137970148115004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:39:30,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=24620, skipped=0, lr=[8.498108040104333e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:39:30,906] [INFO] [timer.py:259:stop] epoch=0/micro_step=24620/global_step=24620, RunningAvgSamplesPerSec=2.6374297534615043, CurrSamplesPerSec=2.637797562048117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:39:46,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=24630, skipped=0, lr=[8.496903985196794e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:39:46,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=24630/global_step=24630, RunningAvgSamplesPerSec=2.637428192806302, CurrSamplesPerSec=2.6348057957832176, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:40:01,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=24640, skipped=0, lr=[8.495699533214679e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:40:01,569] [INFO] [timer.py:259:stop] epoch=0/micro_step=24640/global_step=24640, RunningAvgSamplesPerSec=2.637418557733921, CurrSamplesPerSec=2.628146210171718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:40:16,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=24650, skipped=0, lr=[8.494494684294753e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:40:16,870] [INFO] [timer.py:259:stop] epoch=0/micro_step=24650/global_step=24650, RunningAvgSamplesPerSec=2.6374167127859645, CurrSamplesPerSec=2.651477011104138, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:40:32,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=24660, skipped=0, lr=[8.493289438573828e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:40:32,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=24660/global_step=24660, RunningAvgSamplesPerSec=2.6374108067863955, CurrSamplesPerSec=2.6329772974581087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:40:47,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=24670, skipped=0, lr=[8.492083796188761e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:40:47,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=24670/global_step=24670, RunningAvgSamplesPerSec=2.6374101014256977, CurrSamplesPerSec=2.635342173500657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:41:02,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=24680, skipped=0, lr=[8.49087775727645e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:41:02,779] [INFO] [timer.py:259:stop] epoch=0/micro_step=24680/global_step=24680, RunningAvgSamplesPerSec=2.6374086657095694, CurrSamplesPerSec=2.6366157031801674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:41:18,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=24690, skipped=0, lr=[8.489671321973845e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:41:18,150] [INFO] [timer.py:259:stop] epoch=0/micro_step=24690/global_step=24690, RunningAvgSamplesPerSec=2.6374016496380084, CurrSamplesPerSec=2.615596922278319, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:41:33,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=24700, skipped=0, lr=[8.488464490417934e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:41:33,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=24700/global_step=24700, RunningAvgSamplesPerSec=2.637398196451752, CurrSamplesPerSec=2.631870351282342, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:41:48,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=24710, skipped=0, lr=[8.487257262745756e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:41:48,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=24710/global_step=24710, RunningAvgSamplesPerSec=2.6373926139670214, CurrSamplesPerSec=2.576496958119652, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:42:04,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=24720, skipped=0, lr=[8.486049639094389e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:42:04,124] [INFO] [timer.py:259:stop] epoch=0/micro_step=24720/global_step=24720, RunningAvgSamplesPerSec=2.6373886251619, CurrSamplesPerSec=2.6294882083740254, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:42:19,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=24730, skipped=0, lr=[8.484841619600961e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:42:19,540] [INFO] [timer.py:259:stop] epoch=0/micro_step=24730/global_step=24730, RunningAvgSamplesPerSec=2.6373784040264834, CurrSamplesPerSec=2.585519381533468, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:42:34,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=24740, skipped=0, lr=[8.483633204402643e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:42:34,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=24740/global_step=24740, RunningAvgSamplesPerSec=2.6373795820567842, CurrSamplesPerSec=2.6235994706618087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:42:50,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=24750, skipped=0, lr=[8.48242439363665e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:42:50,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=24750/global_step=24750, RunningAvgSamplesPerSec=2.6373747217498034, CurrSamplesPerSec=2.6399505103435508, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:43:05,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=24760, skipped=0, lr=[8.481215187440243e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:43:05,416] [INFO] [timer.py:259:stop] epoch=0/micro_step=24760/global_step=24760, RunningAvgSamplesPerSec=2.637371421410217, CurrSamplesPerSec=2.63518736307043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:43:20,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=24770, skipped=0, lr=[8.480005585950729e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:43:20,796] [INFO] [timer.py:259:stop] epoch=0/micro_step=24770/global_step=24770, RunningAvgSamplesPerSec=2.6373637685361917, CurrSamplesPerSec=2.6066002199862086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:43:36,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=24780, skipped=0, lr=[8.478795589305455e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:43:36,018] [INFO] [timer.py:259:stop] epoch=0/micro_step=24780/global_step=24780, RunningAvgSamplesPerSec=2.637365452793601, CurrSamplesPerSec=2.6367342143612076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:43:51,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=24790, skipped=0, lr=[8.477585197641822e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:43:51,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=24790/global_step=24790, RunningAvgSamplesPerSec=2.6373640530160487, CurrSamplesPerSec=2.6314422789092604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:44:06,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=24800, skipped=0, lr=[8.476374411097266e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:44:06,655] [INFO] [timer.py:259:stop] epoch=0/micro_step=24800/global_step=24800, RunningAvgSamplesPerSec=2.637359902536545, CurrSamplesPerSec=2.636072594184618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:44:21,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=24810, skipped=0, lr=[8.475163229809273e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:44:21,906] [INFO] [timer.py:259:stop] epoch=0/micro_step=24810/global_step=24810, RunningAvgSamplesPerSec=2.637360778337167, CurrSamplesPerSec=2.647045117942205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:44:37,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=24820, skipped=0, lr=[8.473951653915376e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:44:37,204] [INFO] [timer.py:259:stop] epoch=0/micro_step=24820/global_step=24820, RunningAvgSamplesPerSec=2.6373581840763727, CurrSamplesPerSec=2.6346225006909276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:44:52,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=24830, skipped=0, lr=[8.472739683553147e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:44:52,565] [INFO] [timer.py:259:stop] epoch=0/micro_step=24830/global_step=24830, RunningAvgSamplesPerSec=2.6373517523783168, CurrSamplesPerSec=2.6438361032231743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:45:07,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=24840, skipped=0, lr=[8.471527318860207e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:45:07,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=24840/global_step=24840, RunningAvgSamplesPerSec=2.6373530206150386, CurrSamplesPerSec=2.6587627163422933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:45:23,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=24850, skipped=0, lr=[8.470314559974217e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:45:23,073] [INFO] [timer.py:259:stop] epoch=0/micro_step=24850/global_step=24850, RunningAvgSamplesPerSec=2.63735137078418, CurrSamplesPerSec=2.665696732556604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:45:38,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=24860, skipped=0, lr=[8.46910140703289e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:45:38,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=24860/global_step=24860, RunningAvgSamplesPerSec=2.637348498119623, CurrSamplesPerSec=2.5841382891138114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:45:53,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=24870, skipped=0, lr=[8.467887860173981e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:45:53,637] [INFO] [timer.py:259:stop] epoch=0/micro_step=24870/global_step=24870, RunningAvgSamplesPerSec=2.6373477497820392, CurrSamplesPerSec=2.656895357604084, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:46:08,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=24880, skipped=0, lr=[8.466673919535284e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:46:08,917] [INFO] [timer.py:259:stop] epoch=0/micro_step=24880/global_step=24880, RunningAvgSamplesPerSec=2.637346271596715, CurrSamplesPerSec=2.5903368562955382, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:46:24,193] [INFO] [logging.py:96:log_dist] [Rank 0] step=24890, skipped=0, lr=[8.465459585254643e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:46:24,214] [INFO] [timer.py:259:stop] epoch=0/micro_step=24890/global_step=24890, RunningAvgSamplesPerSec=2.637342918540349, CurrSamplesPerSec=2.640366396212326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:46:39,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=24900, skipped=0, lr=[8.464244857469951e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:46:39,519] [INFO] [timer.py:259:stop] epoch=0/micro_step=24900/global_step=24900, RunningAvgSamplesPerSec=2.6373396997810414, CurrSamplesPerSec=2.572301724370837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:46:54,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=24910, skipped=0, lr=[8.463029736319135e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:46:54,749] [INFO] [timer.py:259:stop] epoch=0/micro_step=24910/global_step=24910, RunningAvgSamplesPerSec=2.637341781172236, CurrSamplesPerSec=2.635391021208049, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:47:10,088] [INFO] [logging.py:96:log_dist] [Rank 0] step=24920, skipped=0, lr=[8.461814221940177e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:47:10,090] [INFO] [timer.py:259:stop] epoch=0/micro_step=24920/global_step=24920, RunningAvgSamplesPerSec=2.6373368781233606, CurrSamplesPerSec=2.6329454804154597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:47:25,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=24930, skipped=0, lr=[8.460598314471096e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:47:25,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=24930/global_step=24930, RunningAvgSamplesPerSec=2.6373318795555605, CurrSamplesPerSec=2.6574348746980627, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:47:40,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=24940, skipped=0, lr=[8.459382014049959e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:47:40,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=24940/global_step=24940, RunningAvgSamplesPerSec=2.637324029741812, CurrSamplesPerSec=2.6132771046298675, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:47:56,181] [INFO] [logging.py:96:log_dist] [Rank 0] step=24950, skipped=0, lr=[8.45816532081488e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:47:56,182] [INFO] [timer.py:259:stop] epoch=0/micro_step=24950/global_step=24950, RunningAvgSamplesPerSec=2.6373178145459732, CurrSamplesPerSec=2.6102436070808817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:48:11,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=24960, skipped=0, lr=[8.45694823490401e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:48:11,556] [INFO] [timer.py:259:stop] epoch=0/micro_step=24960/global_step=24960, RunningAvgSamplesPerSec=2.6373103858345446, CurrSamplesPerSec=2.6379983055243597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:48:26,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=24970, skipped=0, lr=[8.455730756455555e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:48:26,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=24970/global_step=24970, RunningAvgSamplesPerSec=2.637302314640962, CurrSamplesPerSec=2.6501187594108377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:48:42,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=24980, skipped=0, lr=[8.454512885607756e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:48:42,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=24980/global_step=24980, RunningAvgSamplesPerSec=2.6372955122573427, CurrSamplesPerSec=2.5966821795094686, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:48:57,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=24990, skipped=0, lr=[8.453294622498905e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:48:57,666] [INFO] [timer.py:259:stop] epoch=0/micro_step=24990/global_step=24990, RunningAvgSamplesPerSec=2.6372892616178496, CurrSamplesPerSec=2.6405238936935063, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:49:12,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=25000, skipped=0, lr=[8.452075967267334e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:49:12,956] [INFO] [timer.py:259:stop] epoch=0/micro_step=25000/global_step=25000, RunningAvgSamplesPerSec=2.6372878616527022, CurrSamplesPerSec=2.6499977863151543, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:49:28,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=25010, skipped=0, lr=[8.450856920051423e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:49:28,260] [INFO] [timer.py:259:stop] epoch=0/micro_step=25010/global_step=25010, RunningAvgSamplesPerSec=2.6372856249220917, CurrSamplesPerSec=2.624793093488213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:49:43,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=25020, skipped=0, lr=[8.449637480989594e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:49:43,562] [INFO] [timer.py:259:stop] epoch=0/micro_step=25020/global_step=25020, RunningAvgSamplesPerSec=2.6372823831142753, CurrSamplesPerSec=2.637660709012074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:49:58,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=25030, skipped=0, lr=[8.448417650220317e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:49:58,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=25030/global_step=25030, RunningAvgSamplesPerSec=2.637278757714528, CurrSamplesPerSec=2.5878252840370846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:50:14,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=25040, skipped=0, lr=[8.447197427882103e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:50:14,177] [INFO] [timer.py:259:stop] epoch=0/micro_step=25040/global_step=25040, RunningAvgSamplesPerSec=2.637275818427582, CurrSamplesPerSec=2.6387364248502894, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:50:29,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=25050, skipped=0, lr=[8.445976814113511e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:50:29,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=25050/global_step=25050, RunningAvgSamplesPerSec=2.63726686362884, CurrSamplesPerSec=2.600794892458373, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:50:44,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=25060, skipped=0, lr=[8.444755809053136e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:50:44,909] [INFO] [timer.py:259:stop] epoch=0/micro_step=25060/global_step=25060, RunningAvgSamplesPerSec=2.637261225094659, CurrSamplesPerSec=2.63722411834846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:51:00,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=25070, skipped=0, lr=[8.44353441283963e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:51:00,294] [INFO] [timer.py:259:stop] epoch=0/micro_step=25070/global_step=25070, RunningAvgSamplesPerSec=2.6372554454225345, CurrSamplesPerSec=2.608194368725793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:51:15,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=25080, skipped=0, lr=[8.442312625611677e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:51:15,605] [INFO] [timer.py:259:stop] epoch=0/micro_step=25080/global_step=25080, RunningAvgSamplesPerSec=2.6372529570112477, CurrSamplesPerSec=2.620677876447459, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:51:30,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=25090, skipped=0, lr=[8.441090447508017e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:51:30,934] [INFO] [timer.py:259:stop] epoch=0/micro_step=25090/global_step=25090, RunningAvgSamplesPerSec=2.637247916747491, CurrSamplesPerSec=2.627725933353435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:51:46,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=25100, skipped=0, lr=[8.439867878667426e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:51:46,274] [INFO] [timer.py:259:stop] epoch=0/micro_step=25100/global_step=25100, RunningAvgSamplesPerSec=2.6372438963389064, CurrSamplesPerSec=2.633819695675564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:52:01,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=25110, skipped=0, lr=[8.438644919228726e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:52:01,665] [INFO] [timer.py:259:stop] epoch=0/micro_step=25110/global_step=25110, RunningAvgSamplesPerSec=2.6372353580589696, CurrSamplesPerSec=2.6430518241414735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:52:16,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=25120, skipped=0, lr=[8.437421569330786e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:52:16,990] [INFO] [timer.py:259:stop] epoch=0/micro_step=25120/global_step=25120, RunningAvgSamplesPerSec=2.637231289000107, CurrSamplesPerSec=2.6539541755187757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:52:32,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=25130, skipped=0, lr=[8.436197829112518e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:52:32,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=25130/global_step=25130, RunningAvgSamplesPerSec=2.6372258663754335, CurrSamplesPerSec=2.665203391651401, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:52:47,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=25140, skipped=0, lr=[8.434973698712878e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:52:47,681] [INFO] [timer.py:259:stop] epoch=0/micro_step=25140/global_step=25140, RunningAvgSamplesPerSec=2.6372193053234603, CurrSamplesPerSec=2.6359392332176026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:53:03,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=25150, skipped=0, lr=[8.433749178270865e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:53:03,035] [INFO] [timer.py:259:stop] epoch=0/micro_step=25150/global_step=25150, RunningAvgSamplesPerSec=2.6372132140789, CurrSamplesPerSec=2.642578483121852, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:53:18,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=25160, skipped=0, lr=[8.432524267925525e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:53:18,362] [INFO] [timer.py:259:stop] epoch=0/micro_step=25160/global_step=25160, RunningAvgSamplesPerSec=2.637208521442132, CurrSamplesPerSec=2.593250478180321, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:53:33,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=25170, skipped=0, lr=[8.431298967815947e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:53:33,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=25170/global_step=25170, RunningAvgSamplesPerSec=2.6372071243580253, CurrSamplesPerSec=2.6477975025961333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:53:49,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=25180, skipped=0, lr=[8.430073278081262e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:53:49,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=25180/global_step=25180, RunningAvgSamplesPerSec=2.637199473115684, CurrSamplesPerSec=2.615786551899428, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:54:04,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=25190, skipped=0, lr=[8.428847198860652e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:54:04,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=25190/global_step=25190, RunningAvgSamplesPerSec=2.6371975900040323, CurrSamplesPerSec=2.6383957349889453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:54:19,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=25200, skipped=0, lr=[8.427620730293335e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:54:19,548] [INFO] [timer.py:259:stop] epoch=0/micro_step=25200/global_step=25200, RunningAvgSamplesPerSec=2.6371979882051324, CurrSamplesPerSec=2.6001511832465787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:54:34,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=25210, skipped=0, lr=[8.426393872518578e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:54:34,897] [INFO] [timer.py:259:stop] epoch=0/micro_step=25210/global_step=25210, RunningAvgSamplesPerSec=2.6371923910547572, CurrSamplesPerSec=2.6247536718231554, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:54:50,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=25220, skipped=0, lr=[8.425166625675692e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:54:50,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=25220/global_step=25220, RunningAvgSamplesPerSec=2.6371869410825957, CurrSamplesPerSec=2.6134619197917073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:55:05,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=25230, skipped=0, lr=[8.423938989904029e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:55:05,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=25230/global_step=25230, RunningAvgSamplesPerSec=2.6371880237719982, CurrSamplesPerSec=2.6409378816722033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:55:20,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=25240, skipped=0, lr=[8.42271096534299e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:55:20,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=25240/global_step=25240, RunningAvgSamplesPerSec=2.637185442272666, CurrSamplesPerSec=2.6535331591550455, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:55:36,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=25250, skipped=0, lr=[8.421482552132014e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:55:36,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=25250/global_step=25250, RunningAvgSamplesPerSec=2.637188298640334, CurrSamplesPerSec=2.633800262362835, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:55:51,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=25260, skipped=0, lr=[8.420253750410592e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:55:51,339] [INFO] [timer.py:259:stop] epoch=0/micro_step=25260/global_step=25260, RunningAvgSamplesPerSec=2.6371852103846503, CurrSamplesPerSec=2.6432371269149444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:56:06,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=25270, skipped=0, lr=[8.419024560318252e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:56:06,622] [INFO] [timer.py:259:stop] epoch=0/micro_step=25270/global_step=25270, RunningAvgSamplesPerSec=2.6371833668384803, CurrSamplesPerSec=2.6306942091045897, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:56:21,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=25280, skipped=0, lr=[8.417794981994572e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:56:21,980] [INFO] [timer.py:259:stop] epoch=0/micro_step=25280/global_step=25280, RunningAvgSamplesPerSec=2.6371768199371495, CurrSamplesPerSec=2.6252789806058257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:56:37,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=25290, skipped=0, lr=[8.416565015579166e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:56:37,331] [INFO] [timer.py:259:stop] epoch=0/micro_step=25290/global_step=25290, RunningAvgSamplesPerSec=2.6371718007711267, CurrSamplesPerSec=2.640990262895347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:56:52,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=25300, skipped=0, lr=[8.4153346612117e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:56:52,688] [INFO] [timer.py:259:stop] epoch=0/micro_step=25300/global_step=25300, RunningAvgSamplesPerSec=2.6371656381404436, CurrSamplesPerSec=2.5995875444664054, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:57:08,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=25310, skipped=0, lr=[8.414103919031882e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:57:08,041] [INFO] [timer.py:259:stop] epoch=0/micro_step=25310/global_step=25310, RunningAvgSamplesPerSec=2.637160277277702, CurrSamplesPerSec=2.571389039615133, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:57:23,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=25320, skipped=0, lr=[8.412872789179462e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:57:23,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=25320/global_step=25320, RunningAvgSamplesPerSec=2.6371570011602836, CurrSamplesPerSec=2.6418989503823935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:57:38,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=25330, skipped=0, lr=[8.411641271794236e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:57:38,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=25330/global_step=25330, RunningAvgSamplesPerSec=2.637151966888757, CurrSamplesPerSec=2.625751075329003, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:57:54,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=25340, skipped=0, lr=[8.410409367016042e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:57:54,048] [INFO] [timer.py:259:stop] epoch=0/micro_step=25340/global_step=25340, RunningAvgSamplesPerSec=2.637147831651866, CurrSamplesPerSec=2.634923730487653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:58:09,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=25350, skipped=0, lr=[8.409177074984765e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:58:09,383] [INFO] [timer.py:259:stop] epoch=0/micro_step=25350/global_step=25350, RunningAvgSamplesPerSec=2.637142144939364, CurrSamplesPerSec=2.590328457626251, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:58:24,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=25360, skipped=0, lr=[8.407944395840328e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:58:24,697] [INFO] [timer.py:259:stop] epoch=0/micro_step=25360/global_step=25360, RunningAvgSamplesPerSec=2.637138934818827, CurrSamplesPerSec=2.636076321858919, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:58:40,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=25370, skipped=0, lr=[8.406711329722708e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:58:40,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=25370/global_step=25370, RunningAvgSamplesPerSec=2.637134250797649, CurrSamplesPerSec=2.592774770589205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:58:55,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=25380, skipped=0, lr=[8.405477876771916e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:58:55,315] [INFO] [timer.py:259:stop] epoch=0/micro_step=25380/global_step=25380, RunningAvgSamplesPerSec=2.63713156140561, CurrSamplesPerSec=2.6475171360624903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:59:10,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=25390, skipped=0, lr=[8.404244037128012e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:59:10,665] [INFO] [timer.py:259:stop] epoch=0/micro_step=25390/global_step=25390, RunningAvgSamplesPerSec=2.6371252709201696, CurrSamplesPerSec=2.6447058933327323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:59:25,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=25400, skipped=0, lr=[8.4030098109311e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:59:25,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=25400/global_step=25400, RunningAvgSamplesPerSec=2.6371213399982016, CurrSamplesPerSec=2.6497140242648536, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:59:41,268] [INFO] [logging.py:96:log_dist] [Rank 0] step=25410, skipped=0, lr=[8.401775198321324e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:59:41,270] [INFO] [timer.py:259:stop] epoch=0/micro_step=25410/global_step=25410, RunningAvgSamplesPerSec=2.6371185645272437, CurrSamplesPerSec=2.618515366152302, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 10:59:56,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=25420, skipped=0, lr=[8.400540199438878e-06], mom=[(0.9, 0.95)] +[2024-11-01 10:59:56,723] [INFO] [timer.py:259:stop] epoch=0/micro_step=25420/global_step=25420, RunningAvgSamplesPerSec=2.6371061870243593, CurrSamplesPerSec=2.5964912911781806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:00:12,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=25430, skipped=0, lr=[8.399304814423993e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:00:12,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=25430/global_step=25430, RunningAvgSamplesPerSec=2.6371033622164126, CurrSamplesPerSec=2.6302562101799185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:00:27,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=25440, skipped=0, lr=[8.398069043416953e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:00:27,455] [INFO] [timer.py:259:stop] epoch=0/micro_step=25440/global_step=25440, RunningAvgSamplesPerSec=2.6370922984402103, CurrSamplesPerSec=2.619105233709229, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:00:42,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=25450, skipped=0, lr=[8.396832886558073e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:00:42,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=25450/global_step=25450, RunningAvgSamplesPerSec=2.6370833120730293, CurrSamplesPerSec=2.635680832981433, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:00:58,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=25460, skipped=0, lr=[8.395596343987724e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:00:58,340] [INFO] [timer.py:259:stop] epoch=0/micro_step=25460/global_step=25460, RunningAvgSamplesPerSec=2.637072067014248, CurrSamplesPerSec=2.621442787616046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:01:13,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=25470, skipped=0, lr=[8.394359415846314e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:01:13,682] [INFO] [timer.py:259:stop] epoch=0/micro_step=25470/global_step=25470, RunningAvgSamplesPerSec=2.6370680226717953, CurrSamplesPerSec=2.637520553046733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:01:28,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=25480, skipped=0, lr=[8.393122102274296e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:01:28,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=25480/global_step=25480, RunningAvgSamplesPerSec=2.637065879129526, CurrSamplesPerSec=2.5562293143512855, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:01:44,416] [INFO] [logging.py:96:log_dist] [Rank 0] step=25490, skipped=0, lr=[8.39188440341217e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:01:44,419] [INFO] [timer.py:259:stop] epoch=0/micro_step=25490/global_step=25490, RunningAvgSamplesPerSec=2.637056011060843, CurrSamplesPerSec=2.6343502945505906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:01:59,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=25500, skipped=0, lr=[8.390646319400474e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:01:59,781] [INFO] [timer.py:259:stop] epoch=0/micro_step=25500/global_step=25500, RunningAvgSamplesPerSec=2.6370498166660417, CurrSamplesPerSec=2.531833585359681, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:02:15,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=25510, skipped=0, lr=[8.389407850379795e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:02:15,098] [INFO] [timer.py:259:stop] epoch=0/micro_step=25510/global_step=25510, RunningAvgSamplesPerSec=2.6370464300905416, CurrSamplesPerSec=2.6288207451843486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:02:30,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=25520, skipped=0, lr=[8.388168996490762e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:02:30,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=25520/global_step=25520, RunningAvgSamplesPerSec=2.6370327226802943, CurrSamplesPerSec=2.6055594432075284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:02:45,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=25530, skipped=0, lr=[8.386929757874044e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:02:45,909] [INFO] [timer.py:259:stop] epoch=0/micro_step=25530/global_step=25530, RunningAvgSamplesPerSec=2.637026873196256, CurrSamplesPerSec=2.6468008206942892, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:03:01,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=25540, skipped=0, lr=[8.385690134670359e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:03:01,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=25540/global_step=25540, RunningAvgSamplesPerSec=2.637023783814707, CurrSamplesPerSec=2.6466225330505155, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:03:16,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=25550, skipped=0, lr=[8.384450127020467e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:03:16,530] [INFO] [timer.py:259:stop] epoch=0/micro_step=25550/global_step=25550, RunningAvgSamplesPerSec=2.6370221503097433, CurrSamplesPerSec=2.633250459812158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:03:31,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=25560, skipped=0, lr=[8.383209735065172e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:03:31,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=25560/global_step=25560, RunningAvgSamplesPerSec=2.63702022696721, CurrSamplesPerSec=2.639891524157603, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:03:47,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=25570, skipped=0, lr=[8.381968958945318e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:03:47,198] [INFO] [timer.py:259:stop] epoch=0/micro_step=25570/global_step=25570, RunningAvgSamplesPerSec=2.637014885459719, CurrSamplesPerSec=2.6328347465742246, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:04:02,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=25580, skipped=0, lr=[8.380727798801798e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:04:02,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=25580/global_step=25580, RunningAvgSamplesPerSec=2.6370118562959344, CurrSamplesPerSec=2.628044524608475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:04:17,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=25590, skipped=0, lr=[8.379486254775547e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:04:17,790] [INFO] [timer.py:259:stop] epoch=0/micro_step=25590/global_step=25590, RunningAvgSamplesPerSec=2.637009834233279, CurrSamplesPerSec=2.6069922954245084, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:04:33,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=25600, skipped=0, lr=[8.378244327007538e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:04:33,112] [INFO] [timer.py:259:stop] epoch=0/micro_step=25600/global_step=25600, RunningAvgSamplesPerSec=2.6370064672063034, CurrSamplesPerSec=2.636665841225829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:04:48,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=25610, skipped=0, lr=[8.377002015638798e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:04:48,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=25610/global_step=25610, RunningAvgSamplesPerSec=2.637001490193932, CurrSamplesPerSec=2.5953569889901407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:05:03,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=25620, skipped=0, lr=[8.375759320810388e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:05:03,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=25620/global_step=25620, RunningAvgSamplesPerSec=2.6369918388789833, CurrSamplesPerSec=2.6189723574699455, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:05:19,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=25630, skipped=0, lr=[8.37451624266342e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:05:19,243] [INFO] [timer.py:259:stop] epoch=0/micro_step=25630/global_step=25630, RunningAvgSamplesPerSec=2.6369835542313997, CurrSamplesPerSec=2.5475181119434414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:05:34,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=25640, skipped=0, lr=[8.373272781339042e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:05:34,576] [INFO] [timer.py:259:stop] epoch=0/micro_step=25640/global_step=25640, RunningAvgSamplesPerSec=2.6369799972693393, CurrSamplesPerSec=2.6199895106389635, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:05:49,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=25650, skipped=0, lr=[8.37202893697845e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:05:49,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=25650/global_step=25650, RunningAvgSamplesPerSec=2.6369735752256833, CurrSamplesPerSec=2.5730804823900573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:06:05,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=25660, skipped=0, lr=[8.370784709722885e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:06:05,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=25660/global_step=25660, RunningAvgSamplesPerSec=2.6369681613359814, CurrSamplesPerSec=2.6435640726884793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:06:20,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=25670, skipped=0, lr=[8.36954009971363e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:06:20,602] [INFO] [timer.py:259:stop] epoch=0/micro_step=25670/global_step=25670, RunningAvgSamplesPerSec=2.6369652333941924, CurrSamplesPerSec=2.6478814987643853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:06:35,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=25680, skipped=0, lr=[8.368295107092006e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:06:35,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=25680/global_step=25680, RunningAvgSamplesPerSec=2.636959421869228, CurrSamplesPerSec=2.6441423607745507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:06:51,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=25690, skipped=0, lr=[8.367049731999389e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:06:51,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=25690/global_step=25690, RunningAvgSamplesPerSec=2.636955884694206, CurrSamplesPerSec=2.6453080381211582, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:07:06,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=25700, skipped=0, lr=[8.365803974577185e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:07:06,659] [INFO] [timer.py:259:stop] epoch=0/micro_step=25700/global_step=25700, RunningAvgSamplesPerSec=2.636950136723044, CurrSamplesPerSec=2.6511405644062735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:07:22,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=25710, skipped=0, lr=[8.364557834966855e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:07:22,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=25710/global_step=25710, RunningAvgSamplesPerSec=2.636943660657755, CurrSamplesPerSec=2.618508009814857, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:07:37,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=25720, skipped=0, lr=[8.363311313309898e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:07:37,382] [INFO] [timer.py:259:stop] epoch=0/micro_step=25720/global_step=25720, RunningAvgSamplesPerSec=2.6369387321034647, CurrSamplesPerSec=2.6279766013892423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:07:52,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=25730, skipped=0, lr=[8.362064409747854e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:07:52,714] [INFO] [timer.py:259:stop] epoch=0/micro_step=25730/global_step=25730, RunningAvgSamplesPerSec=2.636933427657995, CurrSamplesPerSec=2.6087503882457637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:08:08,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=25740, skipped=0, lr=[8.360817124422312e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:08:08,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=25740/global_step=25740, RunningAvgSamplesPerSec=2.636920168690609, CurrSamplesPerSec=2.5958099483093946, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:08:23,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=25750, skipped=0, lr=[8.359569457474903e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:08:23,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=25750/global_step=25750, RunningAvgSamplesPerSec=2.6369163197874403, CurrSamplesPerSec=2.6326690763920655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:08:38,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=25760, skipped=0, lr=[8.358321409047296e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:08:38,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=25760/global_step=25760, RunningAvgSamplesPerSec=2.6369056744725228, CurrSamplesPerSec=2.590257271092818, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:08:54,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=25770, skipped=0, lr=[8.35707297928121e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:08:54,209] [INFO] [timer.py:259:stop] epoch=0/micro_step=25770/global_step=25770, RunningAvgSamplesPerSec=2.636905042099517, CurrSamplesPerSec=2.6423807875184027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:09:09,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=25780, skipped=0, lr=[8.355824168318402e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:09:09,535] [INFO] [timer.py:259:stop] epoch=0/micro_step=25780/global_step=25780, RunningAvgSamplesPerSec=2.6369014807478135, CurrSamplesPerSec=2.577513454176837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:09:24,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=25790, skipped=0, lr=[8.354574976300678e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:09:24,892] [INFO] [timer.py:259:stop] epoch=0/micro_step=25790/global_step=25790, RunningAvgSamplesPerSec=2.6368961364143857, CurrSamplesPerSec=2.608023676523187, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:09:40,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=25800, skipped=0, lr=[8.353325403369883e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:09:40,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=25800/global_step=25800, RunningAvgSamplesPerSec=2.636887125402435, CurrSamplesPerSec=2.6052268614266003, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:09:55,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=25810, skipped=0, lr=[8.352075449667907e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:09:55,588] [INFO] [timer.py:259:stop] epoch=0/micro_step=25810/global_step=25810, RunningAvgSamplesPerSec=2.6368838155783814, CurrSamplesPerSec=2.650556699017288, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:10:10,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=25820, skipped=0, lr=[8.35082511533668e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:10:10,978] [INFO] [timer.py:259:stop] epoch=0/micro_step=25820/global_step=25820, RunningAvgSamplesPerSec=2.6368768325540497, CurrSamplesPerSec=2.638536813905375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:10:26,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=25830, skipped=0, lr=[8.349574400518182e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:10:26,334] [INFO] [timer.py:259:stop] epoch=0/micro_step=25830/global_step=25830, RunningAvgSamplesPerSec=2.636871322830554, CurrSamplesPerSec=2.633580313794169, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:10:41,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=25840, skipped=0, lr=[8.34832330535443e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:10:41,676] [INFO] [timer.py:259:stop] epoch=0/micro_step=25840/global_step=25840, RunningAvgSamplesPerSec=2.636865476197616, CurrSamplesPerSec=2.6326215688401375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:10:57,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=25850, skipped=0, lr=[8.347071829987482e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:10:57,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=25850/global_step=25850, RunningAvgSamplesPerSec=2.636862804718416, CurrSamplesPerSec=2.6286885288112134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:11:12,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=25860, skipped=0, lr=[8.345819974559452e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:11:12,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=25860/global_step=25860, RunningAvgSamplesPerSec=2.6368561542890343, CurrSamplesPerSec=2.6358042293380546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:11:27,687] [INFO] [logging.py:96:log_dist] [Rank 0] step=25870, skipped=0, lr=[8.344567739212483e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:11:27,688] [INFO] [timer.py:259:stop] epoch=0/micro_step=25870/global_step=25870, RunningAvgSamplesPerSec=2.6368537559041125, CurrSamplesPerSec=2.593289760913012, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:11:43,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=25880, skipped=0, lr=[8.343315124088768e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:11:43,042] [INFO] [timer.py:259:stop] epoch=0/micro_step=25880/global_step=25880, RunningAvgSamplesPerSec=2.636848587783978, CurrSamplesPerSec=2.6328157409559414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:11:58,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=25890, skipped=0, lr=[8.342062129330542e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:11:58,406] [INFO] [timer.py:259:stop] epoch=0/micro_step=25890/global_step=25890, RunningAvgSamplesPerSec=2.6368428106318875, CurrSamplesPerSec=2.557696139940695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:12:13,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=25900, skipped=0, lr=[8.34080875508008e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:12:13,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=25900/global_step=25900, RunningAvgSamplesPerSec=2.6368377859909153, CurrSamplesPerSec=2.636369598610199, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:12:29,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=25910, skipped=0, lr=[8.339555001479708e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:12:29,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=25910/global_step=25910, RunningAvgSamplesPerSec=2.6368380380198473, CurrSamplesPerSec=2.6238017518871013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:12:44,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=25920, skipped=0, lr=[8.338300868671787e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:12:44,352] [INFO] [timer.py:259:stop] epoch=0/micro_step=25920/global_step=25920, RunningAvgSamplesPerSec=2.636832821993174, CurrSamplesPerSec=2.632439403599618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:12:59,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=25930, skipped=0, lr=[8.337046356798727e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:12:59,621] [INFO] [timer.py:259:stop] epoch=0/micro_step=25930/global_step=25930, RunningAvgSamplesPerSec=2.6368318208483745, CurrSamplesPerSec=2.6100328537356776, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:13:14,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=25940, skipped=0, lr=[8.335791466002973e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:13:14,957] [INFO] [timer.py:259:stop] epoch=0/micro_step=25940/global_step=25940, RunningAvgSamplesPerSec=2.6368269181115767, CurrSamplesPerSec=2.635695739310272, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:13:30,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=25950, skipped=0, lr=[8.334536196427025e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:13:30,244] [INFO] [timer.py:259:stop] epoch=0/micro_step=25950/global_step=25950, RunningAvgSamplesPerSec=2.6368254062228997, CurrSamplesPerSec=2.6388090558681796, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:13:45,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=25960, skipped=0, lr=[8.333280548213414e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:13:45,518] [INFO] [timer.py:259:stop] epoch=0/micro_step=25960/global_step=25960, RunningAvgSamplesPerSec=2.6368243325290286, CurrSamplesPerSec=2.637376266016662, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:14:00,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=25970, skipped=0, lr=[8.332024521504721e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:14:00,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=25970/global_step=25970, RunningAvgSamplesPerSec=2.6368197286916235, CurrSamplesPerSec=2.6054639487511086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:14:16,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=25980, skipped=0, lr=[8.330768116443568e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:14:16,184] [INFO] [timer.py:259:stop] epoch=0/micro_step=25980/global_step=25980, RunningAvgSamplesPerSec=2.6368161558955423, CurrSamplesPerSec=2.6427820364482297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:14:31,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=25990, skipped=0, lr=[8.329511333172621e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:14:31,532] [INFO] [timer.py:259:stop] epoch=0/micro_step=25990/global_step=25990, RunningAvgSamplesPerSec=2.6368117153547135, CurrSamplesPerSec=2.628443901810048, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:14:46,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=26000, skipped=0, lr=[8.32825417183459e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:14:46,842] [INFO] [timer.py:259:stop] epoch=0/micro_step=26000/global_step=26000, RunningAvgSamplesPerSec=2.6368086090866494, CurrSamplesPerSec=2.627783142371519, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:15:02,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=26010, skipped=0, lr=[8.326996632572219e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:15:02,136] [INFO] [timer.py:259:stop] epoch=0/micro_step=26010/global_step=26010, RunningAvgSamplesPerSec=2.6368065827311544, CurrSamplesPerSec=2.6463490939961263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:15:17,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=26020, skipped=0, lr=[8.32573871552831e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:15:17,447] [INFO] [timer.py:259:stop] epoch=0/micro_step=26020/global_step=26020, RunningAvgSamplesPerSec=2.63680380939885, CurrSamplesPerSec=2.619521121250023, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:15:32,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=26030, skipped=0, lr=[8.324480420845696e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:15:32,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=26030/global_step=26030, RunningAvgSamplesPerSec=2.636797770132534, CurrSamplesPerSec=2.628095572247736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:15:48,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=26040, skipped=0, lr=[8.323221748667256e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:15:48,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=26040/global_step=26040, RunningAvgSamplesPerSec=2.636798330704374, CurrSamplesPerSec=2.6460210417034387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:16:03,376] [INFO] [logging.py:96:log_dist] [Rank 0] step=26050, skipped=0, lr=[8.321962699135914e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:16:03,379] [INFO] [timer.py:259:stop] epoch=0/micro_step=26050/global_step=26050, RunningAvgSamplesPerSec=2.636795817732014, CurrSamplesPerSec=2.644845146218317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:16:18,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=26060, skipped=0, lr=[8.320703272394638e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:16:18,640] [INFO] [timer.py:259:stop] epoch=0/micro_step=26060/global_step=26060, RunningAvgSamplesPerSec=2.636796546101602, CurrSamplesPerSec=2.601865759664182, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:16:33,908] [INFO] [logging.py:96:log_dist] [Rank 0] step=26070, skipped=0, lr=[8.319443468586432e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:16:33,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=26070/global_step=26070, RunningAvgSamplesPerSec=2.6367941083166624, CurrSamplesPerSec=2.6382032281032197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:16:49,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=26080, skipped=0, lr=[8.31818328785435e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:16:49,177] [INFO] [timer.py:259:stop] epoch=0/micro_step=26080/global_step=26080, RunningAvgSamplesPerSec=2.636794053295336, CurrSamplesPerSec=2.5968658606716746, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:17:04,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=26090, skipped=0, lr=[8.316922730341484e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:17:04,386] [INFO] [timer.py:259:stop] epoch=0/micro_step=26090/global_step=26090, RunningAvgSamplesPerSec=2.6367964212031603, CurrSamplesPerSec=2.6475338477534653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:17:19,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=26100, skipped=0, lr=[8.315661796190971e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:17:19,690] [INFO] [timer.py:259:stop] epoch=0/micro_step=26100/global_step=26100, RunningAvgSamplesPerSec=2.63679294410923, CurrSamplesPerSec=2.5829646452490795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:17:34,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=26110, skipped=0, lr=[8.314400485545992e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:17:34,923] [INFO] [timer.py:259:stop] epoch=0/micro_step=26110/global_step=26110, RunningAvgSamplesPerSec=2.636794100531614, CurrSamplesPerSec=2.6491604865438876, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:17:50,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=26120, skipped=0, lr=[8.313138798549768e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:17:50,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=26120/global_step=26120, RunningAvgSamplesPerSec=2.636794721972731, CurrSamplesPerSec=2.648868121204145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:18:05,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=26130, skipped=0, lr=[8.311876735345565e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:18:05,374] [INFO] [timer.py:259:stop] epoch=0/micro_step=26130/global_step=26130, RunningAvgSamplesPerSec=2.636795953930713, CurrSamplesPerSec=2.64163355732478, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:18:20,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=26140, skipped=0, lr=[8.310614296076689e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:18:20,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=26140/global_step=26140, RunningAvgSamplesPerSec=2.6367956155116077, CurrSamplesPerSec=2.644283221398417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:18:35,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=26150, skipped=0, lr=[8.30935148088649e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:18:35,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=26150/global_step=26150, RunningAvgSamplesPerSec=2.636793684824517, CurrSamplesPerSec=2.6326223950437817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:18:51,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=26160, skipped=0, lr=[8.308088289918363e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:18:51,221] [INFO] [timer.py:259:stop] epoch=0/micro_step=26160/global_step=26160, RunningAvgSamplesPerSec=2.6367897262881987, CurrSamplesPerSec=2.6429614724372, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:19:06,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=26170, skipped=0, lr=[8.306824723315746e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:19:06,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=26170/global_step=26170, RunningAvgSamplesPerSec=2.636791543856223, CurrSamplesPerSec=2.646344084952715, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:19:21,732] [INFO] [logging.py:96:log_dist] [Rank 0] step=26180, skipped=0, lr=[8.30556078122211e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:19:21,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=26180/global_step=26180, RunningAvgSamplesPerSec=2.6367904751621363, CurrSamplesPerSec=2.640035255675183, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:19:37,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=26190, skipped=0, lr=[8.304296463780983e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:19:37,029] [INFO] [timer.py:259:stop] epoch=0/micro_step=26190/global_step=26190, RunningAvgSamplesPerSec=2.636786925617833, CurrSamplesPerSec=2.6110585152627364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:19:52,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=26200, skipped=0, lr=[8.303031771135927e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:19:52,294] [INFO] [timer.py:259:stop] epoch=0/micro_step=26200/global_step=26200, RunningAvgSamplesPerSec=2.6367858372338997, CurrSamplesPerSec=2.6235395718629455, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:20:07,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=26210, skipped=0, lr=[8.301766703430547e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:20:07,549] [INFO] [timer.py:259:stop] epoch=0/micro_step=26210/global_step=26210, RunningAvgSamplesPerSec=2.6367843073318746, CurrSamplesPerSec=2.6388339588518956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:20:22,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=26220, skipped=0, lr=[8.30050126080849e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:20:22,806] [INFO] [timer.py:259:stop] epoch=0/micro_step=26220/global_step=26220, RunningAvgSamplesPerSec=2.6367829702812715, CurrSamplesPerSec=2.6416298139169547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:20:38,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=26230, skipped=0, lr=[8.299235443413453e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:20:38,051] [INFO] [timer.py:259:stop] epoch=0/micro_step=26230/global_step=26230, RunningAvgSamplesPerSec=2.636782754598581, CurrSamplesPerSec=2.6376644411792425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:20:53,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=26240, skipped=0, lr=[8.297969251389165e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:20:53,325] [INFO] [timer.py:259:stop] epoch=0/micro_step=26240/global_step=26240, RunningAvgSamplesPerSec=2.63678082138176, CurrSamplesPerSec=2.624413298577185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:21:08,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=26250, skipped=0, lr=[8.296702684879405e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:21:08,600] [INFO] [timer.py:259:stop] epoch=0/micro_step=26250/global_step=26250, RunningAvgSamplesPerSec=2.6367781914415263, CurrSamplesPerSec=2.587793351411006, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:21:23,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=26260, skipped=0, lr=[8.29543574402799e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:21:23,871] [INFO] [timer.py:259:stop] epoch=0/micro_step=26260/global_step=26260, RunningAvgSamplesPerSec=2.636777160469787, CurrSamplesPerSec=2.6365547942355247, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:21:39,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=26270, skipped=0, lr=[8.294168428978784e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:21:39,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=26270/global_step=26270, RunningAvgSamplesPerSec=2.636775868763455, CurrSamplesPerSec=2.603361204497472, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:21:54,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=26280, skipped=0, lr=[8.292900739875688e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:21:54,347] [INFO] [timer.py:259:stop] epoch=0/micro_step=26280/global_step=26280, RunningAvgSamplesPerSec=2.636778494671986, CurrSamplesPerSec=2.6290419593227905, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:22:09,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=26290, skipped=0, lr=[8.291632676862652e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:22:09,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=26290/global_step=26290, RunningAvgSamplesPerSec=2.6367767652762133, CurrSamplesPerSec=2.559688243320338, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:22:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=26300, skipped=0, lr=[8.290364240083663e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:22:24,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=26300/global_step=26300, RunningAvgSamplesPerSec=2.636778839031408, CurrSamplesPerSec=2.6393196609987335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:22:40,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=26310, skipped=0, lr=[8.289095429682753e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:22:40,081] [INFO] [timer.py:259:stop] epoch=0/micro_step=26310/global_step=26310, RunningAvgSamplesPerSec=2.6367785092145324, CurrSamplesPerSec=2.614934861558878, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:22:55,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=26320, skipped=0, lr=[8.287826245803994e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:22:55,336] [INFO] [timer.py:259:stop] epoch=0/micro_step=26320/global_step=26320, RunningAvgSamplesPerSec=2.636779115048815, CurrSamplesPerSec=2.6414800863031727, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:23:10,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=26330, skipped=0, lr=[8.286556688591505e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:23:10,592] [INFO] [timer.py:259:stop] epoch=0/micro_step=26330/global_step=26330, RunningAvgSamplesPerSec=2.636778440069607, CurrSamplesPerSec=2.6647754128781918, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:23:25,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=26340, skipped=0, lr=[8.285286758189444e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:23:25,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=26340/global_step=26340, RunningAvgSamplesPerSec=2.6367804893201234, CurrSamplesPerSec=2.6414023178239807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:23:41,057] [INFO] [logging.py:96:log_dist] [Rank 0] step=26350, skipped=0, lr=[8.28401645474201e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:23:41,076] [INFO] [timer.py:259:stop] epoch=0/micro_step=26350/global_step=26350, RunningAvgSamplesPerSec=2.636782063455767, CurrSamplesPerSec=2.6591984598984966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:23:56,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=26360, skipped=0, lr=[8.282745778393449e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:23:56,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=26360/global_step=26360, RunningAvgSamplesPerSec=2.6367811482182395, CurrSamplesPerSec=2.636431327594479, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:24:11,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=26370, skipped=0, lr=[8.281474729288045e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:24:11,641] [INFO] [timer.py:259:stop] epoch=0/micro_step=26370/global_step=26370, RunningAvgSamplesPerSec=2.636779237544316, CurrSamplesPerSec=2.6339176935578497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:24:26,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=26380, skipped=0, lr=[8.280203307570125e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:24:26,851] [INFO] [timer.py:259:stop] epoch=0/micro_step=26380/global_step=26380, RunningAvgSamplesPerSec=2.636781769826549, CurrSamplesPerSec=2.6613451260256125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:24:42,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=26390, skipped=0, lr=[8.278931513384063e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:24:42,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=26390/global_step=26390, RunningAvgSamplesPerSec=2.636780453582473, CurrSamplesPerSec=2.6558363195017565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:24:57,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=26400, skipped=0, lr=[8.27765934687427e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:24:57,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=26400/global_step=26400, RunningAvgSamplesPerSec=2.6367825709473114, CurrSamplesPerSec=2.6024469379308246, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:25:12,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=26410, skipped=0, lr=[8.276386808185198e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:25:12,589] [INFO] [timer.py:259:stop] epoch=0/micro_step=26410/global_step=26410, RunningAvgSamplesPerSec=2.636782659579402, CurrSamplesPerSec=2.6429456511153475, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:25:27,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=26420, skipped=0, lr=[8.275113897461349e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:25:27,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=26420/global_step=26420, RunningAvgSamplesPerSec=2.636785672898815, CurrSamplesPerSec=2.643919431585249, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:25:43,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=26430, skipped=0, lr=[8.273840614847257e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:25:43,057] [INFO] [timer.py:259:stop] epoch=0/micro_step=26430/global_step=26430, RunningAvgSamplesPerSec=2.63678472831236, CurrSamplesPerSec=2.659592184705663, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:25:58,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=26440, skipped=0, lr=[8.27256696048751e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:25:58,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=26440/global_step=26440, RunningAvgSamplesPerSec=2.636785378360126, CurrSamplesPerSec=2.629065442338782, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:26:13,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=26450, skipped=0, lr=[8.271292934526727e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:26:13,565] [INFO] [timer.py:259:stop] epoch=0/micro_step=26450/global_step=26450, RunningAvgSamplesPerSec=2.63678637234929, CurrSamplesPerSec=2.6682722664427407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:26:28,754] [INFO] [logging.py:96:log_dist] [Rank 0] step=26460, skipped=0, lr=[8.270018537109575e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:26:28,755] [INFO] [timer.py:259:stop] epoch=0/micro_step=26460/global_step=26460, RunningAvgSamplesPerSec=2.6367900617935764, CurrSamplesPerSec=2.6491274405882645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:26:43,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=26470, skipped=0, lr=[8.268743768380763e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:26:43,967] [INFO] [timer.py:259:stop] epoch=0/micro_step=26470/global_step=26470, RunningAvgSamplesPerSec=2.6367923418116392, CurrSamplesPerSec=2.6357744143946733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:26:59,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=26480, skipped=0, lr=[8.267468628485043e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:26:59,166] [INFO] [timer.py:259:stop] epoch=0/micro_step=26480/global_step=26480, RunningAvgSamplesPerSec=2.636794729560261, CurrSamplesPerSec=2.6004187856924443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:27:14,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=26490, skipped=0, lr=[8.266193117567202e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:27:14,475] [INFO] [timer.py:259:stop] epoch=0/micro_step=26490/global_step=26490, RunningAvgSamplesPerSec=2.6367910057935133, CurrSamplesPerSec=2.6384123317254797, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:27:29,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=26500, skipped=0, lr=[8.264917235772083e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:27:29,749] [INFO] [timer.py:259:stop] epoch=0/micro_step=26500/global_step=26500, RunningAvgSamplesPerSec=2.6367894324533903, CurrSamplesPerSec=2.608882229239033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:27:45,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=26510, skipped=0, lr=[8.263640983244557e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:27:45,003] [INFO] [timer.py:259:stop] epoch=0/micro_step=26510/global_step=26510, RunningAvgSamplesPerSec=2.6367890731265398, CurrSamplesPerSec=2.636570539135181, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:28:00,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=26520, skipped=0, lr=[8.262364360129543e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:28:00,193] [INFO] [timer.py:259:stop] epoch=0/micro_step=26520/global_step=26520, RunningAvgSamplesPerSec=2.6367929243652037, CurrSamplesPerSec=2.6428644657331297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:28:15,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=26530, skipped=0, lr=[8.261087366572001e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:28:15,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=26530/global_step=26530, RunningAvgSamplesPerSec=2.636794884827794, CurrSamplesPerSec=2.6170673734482905, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:28:30,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=26540, skipped=0, lr=[8.259810002716939e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:28:30,705] [INFO] [timer.py:259:stop] epoch=0/micro_step=26540/global_step=26540, RunningAvgSamplesPerSec=2.6367932880009377, CurrSamplesPerSec=2.629313893624845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:28:45,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=26550, skipped=0, lr=[8.2585322687094e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:28:45,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=26550/global_step=26550, RunningAvgSamplesPerSec=2.63679141062044, CurrSamplesPerSec=2.6238267827670674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:29:01,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=26560, skipped=0, lr=[8.257254164694468e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:29:01,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=26560/global_step=26560, RunningAvgSamplesPerSec=2.6367909251287136, CurrSamplesPerSec=2.6531265413755443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:29:16,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=26570, skipped=0, lr=[8.255975690817275e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:29:16,494] [INFO] [timer.py:259:stop] epoch=0/micro_step=26570/global_step=26570, RunningAvgSamplesPerSec=2.6367924729393875, CurrSamplesPerSec=2.6430622337208374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:29:31,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=26580, skipped=0, lr=[8.25469684722299e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:29:31,764] [INFO] [timer.py:259:stop] epoch=0/micro_step=26580/global_step=26580, RunningAvgSamplesPerSec=2.6367914954454656, CurrSamplesPerSec=2.6313968792082574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:29:47,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=26590, skipped=0, lr=[8.253417634056832e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:29:47,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=26590/global_step=26590, RunningAvgSamplesPerSec=2.636791906282729, CurrSamplesPerSec=2.6540578760225775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:30:02,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=26600, skipped=0, lr=[8.252138051464049e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:30:02,233] [INFO] [timer.py:259:stop] epoch=0/micro_step=26600/global_step=26600, RunningAvgSamplesPerSec=2.6367948568962243, CurrSamplesPerSec=2.6371262887943634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:30:17,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=26610, skipped=0, lr=[8.250858099589942e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:30:17,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=26610/global_step=26610, RunningAvgSamplesPerSec=2.63679268057187, CurrSamplesPerSec=2.6356464662546464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:30:32,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=26620, skipped=0, lr=[8.249577778579848e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:30:32,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=26620/global_step=26620, RunningAvgSamplesPerSec=2.6367958656688484, CurrSamplesPerSec=2.64832246158393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:30:47,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=26630, skipped=0, lr=[8.248297088579147e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:30:47,904] [INFO] [timer.py:259:stop] epoch=0/micro_step=26630/global_step=26630, RunningAvgSamplesPerSec=2.636798655771395, CurrSamplesPerSec=2.6598468611183828, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:31:03,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=26640, skipped=0, lr=[8.247016029733265e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:31:03,172] [INFO] [timer.py:259:stop] epoch=0/micro_step=26640/global_step=26640, RunningAvgSamplesPerSec=2.6367975921532345, CurrSamplesPerSec=2.657279141344153, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:31:18,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=26650, skipped=0, lr=[8.245734602187665e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:31:18,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=26650/global_step=26650, RunningAvgSamplesPerSec=2.6368007196892576, CurrSamplesPerSec=2.632213074660646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:31:33,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=26660, skipped=0, lr=[8.244452806087853e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:31:33,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=26660/global_step=26660, RunningAvgSamplesPerSec=2.636803243053088, CurrSamplesPerSec=2.665795846233558, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:31:48,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=26670, skipped=0, lr=[8.243170641579379e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:31:48,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=26670/global_step=26670, RunningAvgSamplesPerSec=2.6368012942750614, CurrSamplesPerSec=2.6085593438689902, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:32:04,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=26680, skipped=0, lr=[8.24188810880783e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:32:04,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=26680/global_step=26680, RunningAvgSamplesPerSec=2.6368040806356565, CurrSamplesPerSec=2.6606124488853005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:32:19,365] [INFO] [logging.py:96:log_dist] [Rank 0] step=26690, skipped=0, lr=[8.240605207918842e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:32:19,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=26690/global_step=26690, RunningAvgSamplesPerSec=2.636800374847148, CurrSamplesPerSec=2.572381393256361, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:32:34,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=26700, skipped=0, lr=[8.239321939058085e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:32:34,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=26700/global_step=26700, RunningAvgSamplesPerSec=2.6368018727558113, CurrSamplesPerSec=2.6597456593531783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:32:49,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=26710, skipped=0, lr=[8.238038302371279e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:32:49,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=26710/global_step=26710, RunningAvgSamplesPerSec=2.636799973154085, CurrSamplesPerSec=2.62869758994498, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:33:05,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=26720, skipped=0, lr=[8.236754298004175e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:33:05,126] [INFO] [timer.py:259:stop] epoch=0/micro_step=26720/global_step=26720, RunningAvgSamplesPerSec=2.636799189153809, CurrSamplesPerSec=2.642392856507405, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:33:20,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=26730, skipped=0, lr=[8.235469926102578e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:33:20,445] [INFO] [timer.py:259:stop] epoch=0/micro_step=26730/global_step=26730, RunningAvgSamplesPerSec=2.636795691402838, CurrSamplesPerSec=2.6412085403598335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:33:35,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=26740, skipped=0, lr=[8.234185186812328e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:33:35,696] [INFO] [timer.py:259:stop] epoch=0/micro_step=26740/global_step=26740, RunningAvgSamplesPerSec=2.636794680223115, CurrSamplesPerSec=2.6578541827703654, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:33:50,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=26750, skipped=0, lr=[8.232900080279305e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:33:50,929] [INFO] [timer.py:259:stop] epoch=0/micro_step=26750/global_step=26750, RunningAvgSamplesPerSec=2.6367960009864144, CurrSamplesPerSec=2.6480244298665703, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:34:06,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=26760, skipped=0, lr=[8.231614606649434e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:34:06,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=26760/global_step=26760, RunningAvgSamplesPerSec=2.636799353307537, CurrSamplesPerSec=2.6618889860080595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:34:21,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=26770, skipped=0, lr=[8.230328766068683e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:34:21,397] [INFO] [timer.py:259:stop] epoch=0/micro_step=26770/global_step=26770, RunningAvgSamplesPerSec=2.636798427689721, CurrSamplesPerSec=2.6595972440270983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:34:36,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=26780, skipped=0, lr=[8.229042558683057e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:34:36,621] [INFO] [timer.py:259:stop] epoch=0/micro_step=26780/global_step=26780, RunningAvgSamplesPerSec=2.6367997216673436, CurrSamplesPerSec=2.6282989591447867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:34:51,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=26790, skipped=0, lr=[8.227755984638608e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:34:51,950] [INFO] [timer.py:259:stop] epoch=0/micro_step=26790/global_step=26790, RunningAvgSamplesPerSec=2.636795514542737, CurrSamplesPerSec=2.6430043574989943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:35:07,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=26800, skipped=0, lr=[8.226469044081423e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:35:07,100] [INFO] [timer.py:259:stop] epoch=0/micro_step=26800/global_step=26800, RunningAvgSamplesPerSec=2.636801860968175, CurrSamplesPerSec=2.676736698279529, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:35:22,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=26810, skipped=0, lr=[8.225181737157638e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:35:22,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=26810/global_step=26810, RunningAvgSamplesPerSec=2.6368065177393243, CurrSamplesPerSec=2.6459271485775027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:35:37,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=26820, skipped=0, lr=[8.223894064013425e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:35:37,510] [INFO] [timer.py:259:stop] epoch=0/micro_step=26820/global_step=26820, RunningAvgSamplesPerSec=2.636808127426412, CurrSamplesPerSec=2.6391174709485963, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:35:52,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=26830, skipped=0, lr=[8.222606024795002e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:35:52,806] [INFO] [timer.py:259:stop] epoch=0/micro_step=26830/global_step=26830, RunningAvgSamplesPerSec=2.6368053610786495, CurrSamplesPerSec=2.641342019213943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:36:08,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=26840, skipped=0, lr=[8.221317619648625e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:36:08,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=26840/global_step=26840, RunningAvgSamplesPerSec=2.636807413225724, CurrSamplesPerSec=2.6448226312849608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:36:23,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=26850, skipped=0, lr=[8.220028848720592e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:36:23,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=26850/global_step=26850, RunningAvgSamplesPerSec=2.6368084834969645, CurrSamplesPerSec=2.6526604899978983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:36:38,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=26860, skipped=0, lr=[8.218739712157245e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:36:38,488] [INFO] [timer.py:259:stop] epoch=0/micro_step=26860/global_step=26860, RunningAvgSamplesPerSec=2.6368100046988023, CurrSamplesPerSec=2.6480302811761236, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:36:53,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=26870, skipped=0, lr=[8.217450210104963e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:36:53,699] [INFO] [timer.py:259:stop] epoch=0/micro_step=26870/global_step=26870, RunningAvgSamplesPerSec=2.6368125157605076, CurrSamplesPerSec=2.6548730677999925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:37:08,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=26880, skipped=0, lr=[8.216160342710172e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:37:08,992] [INFO] [timer.py:259:stop] epoch=0/micro_step=26880/global_step=26880, RunningAvgSamplesPerSec=2.636810296294148, CurrSamplesPerSec=2.6138548406277207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:37:24,187] [INFO] [logging.py:96:log_dist] [Rank 0] step=26890, skipped=0, lr=[8.214870110119338e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:37:24,189] [INFO] [timer.py:259:stop] epoch=0/micro_step=26890/global_step=26890, RunningAvgSamplesPerSec=2.636813085121651, CurrSamplesPerSec=2.6209873905726693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:37:39,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=26900, skipped=0, lr=[8.213579512478965e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:37:39,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=26900/global_step=26900, RunningAvgSamplesPerSec=2.6368160592004424, CurrSamplesPerSec=2.6247068601338728, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:37:54,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=26910, skipped=0, lr=[8.2122885499356e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:37:54,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=26910/global_step=26910, RunningAvgSamplesPerSec=2.6368157568715485, CurrSamplesPerSec=2.6171155460288635, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:38:09,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=26920, skipped=0, lr=[8.210997222635832e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:38:09,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=26920/global_step=26920, RunningAvgSamplesPerSec=2.636816133020948, CurrSamplesPerSec=2.625668066392881, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:38:25,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=26930, skipped=0, lr=[8.209705530726297e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:38:25,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=26930/global_step=26930, RunningAvgSamplesPerSec=2.636818513965845, CurrSamplesPerSec=2.6393013920414505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:38:40,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=26940, skipped=0, lr=[8.20841347435366e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:38:40,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=26940/global_step=26940, RunningAvgSamplesPerSec=2.6368155875801063, CurrSamplesPerSec=2.616952256328516, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:38:55,703] [INFO] [logging.py:96:log_dist] [Rank 0] step=26950, skipped=0, lr=[8.20712105366464e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:38:55,705] [INFO] [timer.py:259:stop] epoch=0/micro_step=26950/global_step=26950, RunningAvgSamplesPerSec=2.636813351434811, CurrSamplesPerSec=2.620757704443858, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:39:10,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=26960, skipped=0, lr=[8.205828268805989e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:39:10,993] [INFO] [timer.py:259:stop] epoch=0/micro_step=26960/global_step=26960, RunningAvgSamplesPerSec=2.636811863159289, CurrSamplesPerSec=2.6432467050678397, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:39:26,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=26970, skipped=0, lr=[8.204535119924502e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:39:26,246] [INFO] [timer.py:259:stop] epoch=0/micro_step=26970/global_step=26970, RunningAvgSamplesPerSec=2.636811321874626, CurrSamplesPerSec=2.627771618052414, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:39:41,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=26980, skipped=0, lr=[8.203241607167018e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:39:41,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=26980/global_step=26980, RunningAvgSamplesPerSec=2.636802410194132, CurrSamplesPerSec=2.612961269625759, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:39:56,912] [INFO] [logging.py:96:log_dist] [Rank 0] step=26990, skipped=0, lr=[8.201947730680418e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:39:56,932] [INFO] [timer.py:259:stop] epoch=0/micro_step=26990/global_step=26990, RunningAvgSamplesPerSec=2.6368018442405035, CurrSamplesPerSec=2.653932764683208, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:40:12,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=27000, skipped=0, lr=[8.200653490611618e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:40:12,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=27000/global_step=27000, RunningAvgSamplesPerSec=2.6367966473358493, CurrSamplesPerSec=2.6250464880801028, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:40:27,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=27010, skipped=0, lr=[8.199358887107582e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:40:27,669] [INFO] [timer.py:259:stop] epoch=0/micro_step=27010/global_step=27010, RunningAvgSamplesPerSec=2.636792016489859, CurrSamplesPerSec=2.6212920631621355, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:40:43,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=27020, skipped=0, lr=[8.198063920315311e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:40:43,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=27020/global_step=27020, RunningAvgSamplesPerSec=2.6367867440208417, CurrSamplesPerSec=2.627185245973444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:40:58,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=27030, skipped=0, lr=[8.196768590381849e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:40:58,372] [INFO] [timer.py:259:stop] epoch=0/micro_step=27030/global_step=27030, RunningAvgSamplesPerSec=2.6367815535017374, CurrSamplesPerSec=2.5824139970715416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:41:13,699] [INFO] [logging.py:96:log_dist] [Rank 0] step=27040, skipped=0, lr=[8.195472897454284e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:41:13,718] [INFO] [timer.py:259:stop] epoch=0/micro_step=27040/global_step=27040, RunningAvgSamplesPerSec=2.6367783708720505, CurrSamplesPerSec=2.657090602144186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:41:28,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=27050, skipped=0, lr=[8.19417684167974e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:41:28,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=27050/global_step=27050, RunningAvgSamplesPerSec=2.636776404528685, CurrSamplesPerSec=2.561202441751311, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:41:44,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=27060, skipped=0, lr=[8.192880423205385e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:41:44,327] [INFO] [timer.py:259:stop] epoch=0/micro_step=27060/global_step=27060, RunningAvgSamplesPerSec=2.6367724258118663, CurrSamplesPerSec=2.629177095399238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:41:59,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=27070, skipped=0, lr=[8.191583642178425e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:41:59,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=27070/global_step=27070, RunningAvgSamplesPerSec=2.6367625062512268, CurrSamplesPerSec=2.5920885684053885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:42:15,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=27080, skipped=0, lr=[8.190286498746115e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:42:15,087] [INFO] [timer.py:259:stop] epoch=0/micro_step=27080/global_step=27080, RunningAvgSamplesPerSec=2.636758435314809, CurrSamplesPerSec=2.6241509963288583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:42:30,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=27090, skipped=0, lr=[8.188988993055743e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:42:30,392] [INFO] [timer.py:259:stop] epoch=0/micro_step=27090/global_step=27090, RunningAvgSamplesPerSec=2.636755942599958, CurrSamplesPerSec=2.639279386587791, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:42:45,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=27100, skipped=0, lr=[8.187691125254641e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:42:45,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=27100/global_step=27100, RunningAvgSamplesPerSec=2.6367547899263233, CurrSamplesPerSec=2.625951633426289, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:43:01,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=27110, skipped=0, lr=[8.186392895490184e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:43:01,055] [INFO] [timer.py:259:stop] epoch=0/micro_step=27110/global_step=27110, RunningAvgSamplesPerSec=2.636748140534931, CurrSamplesPerSec=2.6219430057120316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:43:16,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=27120, skipped=0, lr=[8.185094303909785e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:43:16,380] [INFO] [timer.py:259:stop] epoch=0/micro_step=27120/global_step=27120, RunningAvgSamplesPerSec=2.636743429276975, CurrSamplesPerSec=2.63174154340429, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:43:31,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=27130, skipped=0, lr=[8.1837953506609e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:43:31,699] [INFO] [timer.py:259:stop] epoch=0/micro_step=27130/global_step=27130, RunningAvgSamplesPerSec=2.636740990661796, CurrSamplesPerSec=2.6335154111803, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:43:47,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=27140, skipped=0, lr=[8.182496035891026e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:43:47,063] [INFO] [timer.py:259:stop] epoch=0/micro_step=27140/global_step=27140, RunningAvgSamplesPerSec=2.636736443548798, CurrSamplesPerSec=2.6187675500458814, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:44:02,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=27150, skipped=0, lr=[8.1811963597477e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:44:02,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=27150/global_step=27150, RunningAvgSamplesPerSec=2.636735958376712, CurrSamplesPerSec=2.660397702397671, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:44:17,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=27160, skipped=0, lr=[8.1798963223785e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:44:17,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=27160/global_step=27160, RunningAvgSamplesPerSec=2.6367344017203136, CurrSamplesPerSec=2.6444015892657613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:44:32,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=27170, skipped=0, lr=[8.178595923931045e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:44:32,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=27170/global_step=27170, RunningAvgSamplesPerSec=2.6367304591377807, CurrSamplesPerSec=2.6297458079361706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:44:48,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=27180, skipped=0, lr=[8.177295164553e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:44:48,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=27180/global_step=27180, RunningAvgSamplesPerSec=2.636726813741143, CurrSamplesPerSec=2.6133902702827707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:45:03,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=27190, skipped=0, lr=[8.175994044392063e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:45:03,618] [INFO] [timer.py:259:stop] epoch=0/micro_step=27190/global_step=27190, RunningAvgSamplesPerSec=2.6367228500318958, CurrSamplesPerSec=2.640590804642332, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:45:18,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=27200, skipped=0, lr=[8.174692563595977e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:45:18,987] [INFO] [timer.py:259:stop] epoch=0/micro_step=27200/global_step=27200, RunningAvgSamplesPerSec=2.6367182492919548, CurrSamplesPerSec=2.5882584481156314, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:45:34,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=27210, skipped=0, lr=[8.173390722312524e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:45:34,286] [INFO] [timer.py:259:stop] epoch=0/micro_step=27210/global_step=27210, RunningAvgSamplesPerSec=2.636716572765677, CurrSamplesPerSec=2.6351211395641974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:45:49,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=27220, skipped=0, lr=[8.172088520689533e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:45:49,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=27220/global_step=27220, RunningAvgSamplesPerSec=2.6367144753520346, CurrSamplesPerSec=2.6124571500539124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:46:04,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=27230, skipped=0, lr=[8.170785958874865e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:46:04,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=27230/global_step=27230, RunningAvgSamplesPerSec=2.6367150586762618, CurrSamplesPerSec=2.630383635499695, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:46:20,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=27240, skipped=0, lr=[8.16948303701643e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:46:20,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=27240/global_step=27240, RunningAvgSamplesPerSec=2.6367097940625945, CurrSamplesPerSec=2.650025412415781, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:46:35,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=27250, skipped=0, lr=[8.16817975526217e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:46:35,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=27250/global_step=27250, RunningAvgSamplesPerSec=2.6367101949931597, CurrSamplesPerSec=2.6338072914002923, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:46:50,770] [INFO] [logging.py:96:log_dist] [Rank 0] step=27260, skipped=0, lr=[8.16687611376008e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:46:50,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=27260/global_step=27260, RunningAvgSamplesPerSec=2.6367055377834125, CurrSamplesPerSec=2.638693678177443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:47:06,114] [INFO] [logging.py:96:log_dist] [Rank 0] step=27270, skipped=0, lr=[8.165572112658184e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:47:06,116] [INFO] [timer.py:259:stop] epoch=0/micro_step=27270/global_step=27270, RunningAvgSamplesPerSec=2.636700316319145, CurrSamplesPerSec=2.6468208638985056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:47:21,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=27280, skipped=0, lr=[8.164267752104552e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:47:21,485] [INFO] [timer.py:259:stop] epoch=0/micro_step=27280/global_step=27280, RunningAvgSamplesPerSec=2.63669402342199, CurrSamplesPerSec=2.625617113011997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:47:36,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=27290, skipped=0, lr=[8.162963032247298e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:47:36,783] [INFO] [timer.py:259:stop] epoch=0/micro_step=27290/global_step=27290, RunningAvgSamplesPerSec=2.6366916432834215, CurrSamplesPerSec=2.602205959603427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:47:52,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=27300, skipped=0, lr=[8.16165795323457e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:47:52,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=27300/global_step=27300, RunningAvgSamplesPerSec=2.6366933536953048, CurrSamplesPerSec=2.651517658640493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:48:07,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=27310, skipped=0, lr=[8.160352515214559e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:48:07,290] [INFO] [timer.py:259:stop] epoch=0/micro_step=27310/global_step=27310, RunningAvgSamplesPerSec=2.6366938376653097, CurrSamplesPerSec=2.633897845296576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:48:22,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=27320, skipped=0, lr=[8.159046718335502e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:48:22,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=27320/global_step=27320, RunningAvgSamplesPerSec=2.6366875767564246, CurrSamplesPerSec=2.6135620728479885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:48:37,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=27330, skipped=0, lr=[8.15774056274567e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:48:37,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=27330/global_step=27330, RunningAvgSamplesPerSec=2.6366870080685363, CurrSamplesPerSec=2.6527787701609427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:48:53,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=27340, skipped=0, lr=[8.156434048593379e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:48:53,193] [INFO] [timer.py:259:stop] epoch=0/micro_step=27340/global_step=27340, RunningAvgSamplesPerSec=2.63668697576965, CurrSamplesPerSec=2.5924794948849317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:49:08,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=27350, skipped=0, lr=[8.155127176026982e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:49:08,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=27350/global_step=27350, RunningAvgSamplesPerSec=2.6366884001081354, CurrSamplesPerSec=2.6054429085407675, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:49:23,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=27360, skipped=0, lr=[8.153819945194876e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:49:23,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=27360/global_step=27360, RunningAvgSamplesPerSec=2.636681070121082, CurrSamplesPerSec=2.63302027234682, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:49:39,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=27370, skipped=0, lr=[8.1525123562455e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:49:39,138] [INFO] [timer.py:259:stop] epoch=0/micro_step=27370/global_step=27370, RunningAvgSamplesPerSec=2.6366776049358913, CurrSamplesPerSec=2.481362733516616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:49:54,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=27380, skipped=0, lr=[8.151204409327325e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:49:54,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=27380/global_step=27380, RunningAvgSamplesPerSec=2.63667872359081, CurrSamplesPerSec=2.659225856673067, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:50:09,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=27390, skipped=0, lr=[8.149896104588874e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:50:09,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=27390/global_step=27390, RunningAvgSamplesPerSec=2.636666616434925, CurrSamplesPerSec=2.5829423762185706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:50:25,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=27400, skipped=0, lr=[8.148587442178703e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:50:25,134] [INFO] [timer.py:259:stop] epoch=0/micro_step=27400/global_step=27400, RunningAvgSamplesPerSec=2.636663549622408, CurrSamplesPerSec=2.6271992335580467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:50:40,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=27410, skipped=0, lr=[8.147278422245413e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:50:40,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=27410/global_step=27410, RunningAvgSamplesPerSec=2.636657205096842, CurrSamplesPerSec=2.6090777841125505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:50:55,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=27420, skipped=0, lr=[8.145969044937642e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:50:55,720] [INFO] [timer.py:259:stop] epoch=0/micro_step=27420/global_step=27420, RunningAvgSamplesPerSec=2.6366591833999578, CurrSamplesPerSec=2.618548061485478, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:51:11,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=27430, skipped=0, lr=[8.144659310404071e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:51:11,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=27430/global_step=27430, RunningAvgSamplesPerSec=2.636655492731332, CurrSamplesPerSec=2.6443895019202657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:51:26,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=27440, skipped=0, lr=[8.14334921879342e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:51:26,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=27440/global_step=27440, RunningAvgSamplesPerSec=2.6366534469358816, CurrSamplesPerSec=2.635965324518938, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:51:41,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=27450, skipped=0, lr=[8.14203877025445e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:51:41,672] [INFO] [timer.py:259:stop] epoch=0/micro_step=27450/global_step=27450, RunningAvgSamplesPerSec=2.6366500804878585, CurrSamplesPerSec=2.6295764047822954, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:51:56,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=27460, skipped=0, lr=[8.140727964935965e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:51:56,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=27460/global_step=27460, RunningAvgSamplesPerSec=2.6366498835707346, CurrSamplesPerSec=2.6495495705425296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:52:12,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=27470, skipped=0, lr=[8.139416802986806e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:52:12,240] [INFO] [timer.py:259:stop] epoch=0/micro_step=27470/global_step=27470, RunningAvgSamplesPerSec=2.6366470704113896, CurrSamplesPerSec=2.63405912107862, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:52:27,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=27480, skipped=0, lr=[8.138105284555855e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:52:27,468] [INFO] [timer.py:259:stop] epoch=0/micro_step=27480/global_step=27480, RunningAvgSamplesPerSec=2.636649222738822, CurrSamplesPerSec=2.6528978998513986, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:52:42,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=27490, skipped=0, lr=[8.136793409792037e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:52:42,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=27490/global_step=27490, RunningAvgSamplesPerSec=2.636642721292329, CurrSamplesPerSec=2.6323910782773465, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:52:58,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=27500, skipped=0, lr=[8.135481178844315e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:52:58,132] [INFO] [timer.py:259:stop] epoch=0/micro_step=27500/global_step=27500, RunningAvgSamplesPerSec=2.63664012449988, CurrSamplesPerSec=2.6227107056711105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:53:13,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=27510, skipped=0, lr=[8.134168591861693e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:53:13,410] [INFO] [timer.py:259:stop] epoch=0/micro_step=27510/global_step=27510, RunningAvgSamplesPerSec=2.636638890440564, CurrSamplesPerSec=2.634772693202372, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:53:28,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=27520, skipped=0, lr=[8.132855648993217e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:53:28,754] [INFO] [timer.py:259:stop] epoch=0/micro_step=27520/global_step=27520, RunningAvgSamplesPerSec=2.636634068582293, CurrSamplesPerSec=2.592037308103606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:53:44,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=27530, skipped=0, lr=[8.131542350387969e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:53:44,056] [INFO] [timer.py:259:stop] epoch=0/micro_step=27530/global_step=27530, RunningAvgSamplesPerSec=2.6366321183028805, CurrSamplesPerSec=2.6357520536297776, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:53:59,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=27540, skipped=0, lr=[8.130228696195077e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:53:59,345] [INFO] [timer.py:259:stop] epoch=0/micro_step=27540/global_step=27540, RunningAvgSamplesPerSec=2.6366299611188913, CurrSamplesPerSec=2.596750504258881, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:54:14,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=27550, skipped=0, lr=[8.128914686563707e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:54:14,676] [INFO] [timer.py:259:stop] epoch=0/micro_step=27550/global_step=27550, RunningAvgSamplesPerSec=2.636625354468827, CurrSamplesPerSec=2.630654609984362, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:54:29,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=27560, skipped=0, lr=[8.127600321643065e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:54:29,988] [INFO] [timer.py:259:stop] epoch=0/micro_step=27560/global_step=27560, RunningAvgSamplesPerSec=2.636621665643246, CurrSamplesPerSec=2.574811691355575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:54:45,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=27570, skipped=0, lr=[8.126285601582397e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:54:45,234] [INFO] [timer.py:259:stop] epoch=0/micro_step=27570/global_step=27570, RunningAvgSamplesPerSec=2.636624387696897, CurrSamplesPerSec=2.650046760251691, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:55:00,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=27580, skipped=0, lr=[8.12497052653099e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:55:00,654] [INFO] [timer.py:259:stop] epoch=0/micro_step=27580/global_step=27580, RunningAvgSamplesPerSec=2.636615573342911, CurrSamplesPerSec=2.6190602587033993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:55:15,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=27590, skipped=0, lr=[8.12365509663817e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:55:15,917] [INFO] [timer.py:259:stop] epoch=0/micro_step=27590/global_step=27590, RunningAvgSamplesPerSec=2.6366169569198545, CurrSamplesPerSec=2.64049521861062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:55:31,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=27600, skipped=0, lr=[8.122339312053306e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:55:31,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=27600/global_step=27600, RunningAvgSamplesPerSec=2.6366095356481467, CurrSamplesPerSec=2.6364789727468216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:55:46,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=27610, skipped=0, lr=[8.121023172925807e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:55:46,719] [INFO] [timer.py:259:stop] epoch=0/micro_step=27610/global_step=27610, RunningAvgSamplesPerSec=2.636599882734418, CurrSamplesPerSec=2.637731621994377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:56:02,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=27620, skipped=0, lr=[8.11970667940512e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:56:02,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=27620/global_step=27620, RunningAvgSamplesPerSec=2.636590800507998, CurrSamplesPerSec=2.6383027971228628, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:56:17,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=27630, skipped=0, lr=[8.11838983164073e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:56:17,412] [INFO] [timer.py:259:stop] epoch=0/micro_step=27630/global_step=27630, RunningAvgSamplesPerSec=2.636590008089528, CurrSamplesPerSec=2.6532423455114493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:56:32,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=27640, skipped=0, lr=[8.117072629782172e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:56:32,778] [INFO] [timer.py:259:stop] epoch=0/micro_step=27640/global_step=27640, RunningAvgSamplesPerSec=2.636583651105975, CurrSamplesPerSec=2.649959696727046, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:56:48,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=27650, skipped=0, lr=[8.115755073979007e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:56:48,037] [INFO] [timer.py:259:stop] epoch=0/micro_step=27650/global_step=27650, RunningAvgSamplesPerSec=2.636583646407591, CurrSamplesPerSec=2.6742113003685617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:57:03,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=27660, skipped=0, lr=[8.11443716438085e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:57:03,281] [INFO] [timer.py:259:stop] epoch=0/micro_step=27660/global_step=27660, RunningAvgSamplesPerSec=2.6365846917352638, CurrSamplesPerSec=2.6508586518016286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:57:18,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=27670, skipped=0, lr=[8.113118901137347e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:57:18,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=27670/global_step=27670, RunningAvgSamplesPerSec=2.6365803402873613, CurrSamplesPerSec=2.5854181787595465, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:57:33,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=27680, skipped=0, lr=[8.111800284398189e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:57:33,923] [INFO] [timer.py:259:stop] epoch=0/micro_step=27680/global_step=27680, RunningAvgSamplesPerSec=2.636576604950842, CurrSamplesPerSec=2.6437819426044307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:57:49,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=27690, skipped=0, lr=[8.110481314313104e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:57:49,298] [INFO] [timer.py:259:stop] epoch=0/micro_step=27690/global_step=27690, RunningAvgSamplesPerSec=2.6365704798285132, CurrSamplesPerSec=2.563524692214729, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:58:04,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=27700, skipped=0, lr=[8.109161991031862e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:58:04,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=27700/global_step=27700, RunningAvgSamplesPerSec=2.636568788524929, CurrSamplesPerSec=2.632030140113136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:58:19,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=27710, skipped=0, lr=[8.107842314704274e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:58:19,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=27710/global_step=27710, RunningAvgSamplesPerSec=2.636567359254178, CurrSamplesPerSec=2.5820710043158024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:58:35,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=27720, skipped=0, lr=[8.106522285480184e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:58:35,165] [INFO] [timer.py:259:stop] epoch=0/micro_step=27720/global_step=27720, RunningAvgSamplesPerSec=2.6365671240507416, CurrSamplesPerSec=2.6248509962056867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:58:50,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=27730, skipped=0, lr=[8.10520190350949e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:58:50,539] [INFO] [timer.py:259:stop] epoch=0/micro_step=27730/global_step=27730, RunningAvgSamplesPerSec=2.6365618636202863, CurrSamplesPerSec=2.596134505732743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:59:05,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=27740, skipped=0, lr=[8.103881168942117e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:59:05,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=27740/global_step=27740, RunningAvgSamplesPerSec=2.6365610281951746, CurrSamplesPerSec=2.652352256160051, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:59:21,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=27750, skipped=0, lr=[8.102560081928035e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:59:21,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=27750/global_step=27750, RunningAvgSamplesPerSec=2.6365557339683314, CurrSamplesPerSec=2.5826425770156565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:59:36,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=27760, skipped=0, lr=[8.101238642617255e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:59:36,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=27760/global_step=27760, RunningAvgSamplesPerSec=2.6365441914798198, CurrSamplesPerSec=2.64043787031321, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 11:59:51,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=27770, skipped=0, lr=[8.099916851159826e-06], mom=[(0.9, 0.95)] +[2024-11-01 11:59:51,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=27770/global_step=27770, RunningAvgSamplesPerSec=2.6365429269755385, CurrSamplesPerSec=2.647700976267598, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:00:07,215] [INFO] [logging.py:96:log_dist] [Rank 0] step=27780, skipped=0, lr=[8.098594707705839e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:00:07,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=27780/global_step=27780, RunningAvgSamplesPerSec=2.636541520936414, CurrSamplesPerSec=2.626506615677565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:00:22,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=27790, skipped=0, lr=[8.097272212405422e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:00:22,542] [INFO] [timer.py:259:stop] epoch=0/micro_step=27790/global_step=27790, RunningAvgSamplesPerSec=2.63653781865534, CurrSamplesPerSec=2.6594252378960674, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:00:37,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=27800, skipped=0, lr=[8.095949365408745e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:00:37,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=27800/global_step=27800, RunningAvgSamplesPerSec=2.6365382831672988, CurrSamplesPerSec=2.661821413670873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:00:53,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=27810, skipped=0, lr=[8.094626166866019e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:00:53,152] [INFO] [timer.py:259:stop] epoch=0/micro_step=27810/global_step=27810, RunningAvgSamplesPerSec=2.636531791673752, CurrSamplesPerSec=2.600939639911148, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +Beginning of Epoch 3/3 +Beginning of Epoch 3/3 +[2024-11-01 12:01:08,456] [INFO] [logging.py:96:log_dist] [Rank 0] step=27820, skipped=0, lr=[8.093302616927494e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:01:08,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=27820/global_step=27820, RunningAvgSamplesPerSec=2.636530169224964, CurrSamplesPerSec=2.607394619354772, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:01:23,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=27830, skipped=0, lr=[8.091978715743458e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:01:23,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=27830/global_step=27830, RunningAvgSamplesPerSec=2.6365295913747273, CurrSamplesPerSec=2.665554428370419, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:01:39,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=27840, skipped=0, lr=[8.090654463464241e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:01:39,144] [INFO] [timer.py:259:stop] epoch=0/micro_step=27840/global_step=27840, RunningAvgSamplesPerSec=2.6365207696870363, CurrSamplesPerSec=2.5677294414521588, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:01:54,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=27850, skipped=0, lr=[8.089329860240212e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:01:54,466] [INFO] [timer.py:259:stop] epoch=0/micro_step=27850/global_step=27850, RunningAvgSamplesPerSec=2.636517266636862, CurrSamplesPerSec=2.6280432896095136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:02:09,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=27860, skipped=0, lr=[8.08800490622178e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:02:09,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=27860/global_step=27860, RunningAvgSamplesPerSec=2.636512667136718, CurrSamplesPerSec=2.5906740478575476, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:02:25,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=27870, skipped=0, lr=[8.086679601559397e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:02:25,065] [INFO] [timer.py:259:stop] epoch=0/micro_step=27870/global_step=27870, RunningAvgSamplesPerSec=2.636511952012075, CurrSamplesPerSec=2.636165789205049, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:02:40,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=27880, skipped=0, lr=[8.085353946403546e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:02:40,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=27880/global_step=27880, RunningAvgSamplesPerSec=2.6365081115509104, CurrSamplesPerSec=2.6309491569836565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:02:55,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=27890, skipped=0, lr=[8.08402794090476e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:02:55,776] [INFO] [timer.py:259:stop] epoch=0/micro_step=27890/global_step=27890, RunningAvgSamplesPerSec=2.6365049790242603, CurrSamplesPerSec=2.6324228819223117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:03:11,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=27900, skipped=0, lr=[8.082701585213605e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:03:11,089] [INFO] [timer.py:259:stop] epoch=0/micro_step=27900/global_step=27900, RunningAvgSamplesPerSec=2.6365018422013415, CurrSamplesPerSec=2.5960726407558985, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:03:26,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=27910, skipped=0, lr=[8.081374879480691e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:03:26,436] [INFO] [timer.py:259:stop] epoch=0/micro_step=27910/global_step=27910, RunningAvgSamplesPerSec=2.636496625186333, CurrSamplesPerSec=2.6345608563797716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:03:41,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=27920, skipped=0, lr=[8.080047823856666e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:03:41,743] [INFO] [timer.py:259:stop] epoch=0/micro_step=27920/global_step=27920, RunningAvgSamplesPerSec=2.636493683729819, CurrSamplesPerSec=2.64971862758895, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:03:56,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=27930, skipped=0, lr=[8.078720418492216e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:03:56,979] [INFO] [timer.py:259:stop] epoch=0/micro_step=27930/global_step=27930, RunningAvgSamplesPerSec=2.636495747099802, CurrSamplesPerSec=2.638736839873319, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:04:12,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=27940, skipped=0, lr=[8.077392663538068e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:04:12,279] [INFO] [timer.py:259:stop] epoch=0/micro_step=27940/global_step=27940, RunningAvgSamplesPerSec=2.636493983557116, CurrSamplesPerSec=2.658768193856482, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:04:27,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=27950, skipped=0, lr=[8.076064559144992e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:04:27,575] [INFO] [timer.py:259:stop] epoch=0/micro_step=27950/global_step=27950, RunningAvgSamplesPerSec=2.6364921431829265, CurrSamplesPerSec=2.624022122104222, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:04:42,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=27960, skipped=0, lr=[8.07473610546379e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:04:42,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=27960/global_step=27960, RunningAvgSamplesPerSec=2.636490555906991, CurrSamplesPerSec=2.653038436304653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:04:58,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=27970, skipped=0, lr=[8.073407302645313e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:04:58,160] [INFO] [timer.py:259:stop] epoch=0/micro_step=27970/global_step=27970, RunningAvgSamplesPerSec=2.6364889569539716, CurrSamplesPerSec=2.6107192469060734, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:05:13,411] [INFO] [logging.py:96:log_dist] [Rank 0] step=27980, skipped=0, lr=[8.072078150840444e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:05:13,413] [INFO] [timer.py:259:stop] epoch=0/micro_step=27980/global_step=27980, RunningAvgSamplesPerSec=2.6364897282767674, CurrSamplesPerSec=2.635499072145957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:05:28,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=27990, skipped=0, lr=[8.07074865020011e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:05:28,768] [INFO] [timer.py:259:stop] epoch=0/micro_step=27990/global_step=27990, RunningAvgSamplesPerSec=2.636483314429332, CurrSamplesPerSec=2.586496355708351, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:05:44,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=28000, skipped=0, lr=[8.069418800875275e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:05:44,036] [INFO] [timer.py:259:stop] epoch=0/micro_step=28000/global_step=28000, RunningAvgSamplesPerSec=2.636482481452166, CurrSamplesPerSec=2.6337638773569303, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:05:59,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=28010, skipped=0, lr=[8.068088603016945e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:05:59,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=28010/global_step=28010, RunningAvgSamplesPerSec=2.636479779348082, CurrSamplesPerSec=2.64169969261245, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:06:14,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=28020, skipped=0, lr=[8.066758056776165e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:06:14,597] [INFO] [timer.py:259:stop] epoch=0/micro_step=28020/global_step=28020, RunningAvgSamplesPerSec=2.6364801688257375, CurrSamplesPerSec=2.619990328932914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:06:29,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=28030, skipped=0, lr=[8.065427162304016e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:06:29,944] [INFO] [timer.py:259:stop] epoch=0/micro_step=28030/global_step=28030, RunningAvgSamplesPerSec=2.6364766241655504, CurrSamplesPerSec=2.5920132805350526, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:06:45,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=28040, skipped=0, lr=[8.064095919751625e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:06:45,242] [INFO] [timer.py:259:stop] epoch=0/micro_step=28040/global_step=28040, RunningAvgSamplesPerSec=2.6364752511136396, CurrSamplesPerSec=2.609495363483028, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:07:00,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=28050, skipped=0, lr=[8.062764329270153e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:07:00,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=28050/global_step=28050, RunningAvgSamplesPerSec=2.6364717016630737, CurrSamplesPerSec=2.6567594607785043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:07:15,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=28060, skipped=0, lr=[8.061432391010803e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:07:15,968] [INFO] [timer.py:259:stop] epoch=0/micro_step=28060/global_step=28060, RunningAvgSamplesPerSec=2.636466885377221, CurrSamplesPerSec=2.6397993115992233, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:07:31,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=28070, skipped=0, lr=[8.060100105124818e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:07:31,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=28070/global_step=28070, RunningAvgSamplesPerSec=2.636460869649275, CurrSamplesPerSec=2.64229089782753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:07:46,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=28080, skipped=0, lr=[8.058767471763478e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:07:46,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=28080/global_step=28080, RunningAvgSamplesPerSec=2.6364567722183363, CurrSamplesPerSec=2.637104734105703, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:08:02,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=28090, skipped=0, lr=[8.057434491078102e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:08:02,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=28090/global_step=28090, RunningAvgSamplesPerSec=2.636451472696932, CurrSamplesPerSec=2.628701296790435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:08:17,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=28100, skipped=0, lr=[8.056101163220056e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:08:17,376] [INFO] [timer.py:259:stop] epoch=0/micro_step=28100/global_step=28100, RunningAvgSamplesPerSec=2.6364461666327883, CurrSamplesPerSec=2.5495505640353606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:08:32,621] [INFO] [logging.py:96:log_dist] [Rank 0] step=28110, skipped=0, lr=[8.054767488340736e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:08:32,642] [INFO] [timer.py:259:stop] epoch=0/micro_step=28110/global_step=28110, RunningAvgSamplesPerSec=2.6364471170146846, CurrSamplesPerSec=2.6458774922405075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:08:47,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=28120, skipped=0, lr=[8.053433466591582e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:08:47,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=28120/global_step=28120, RunningAvgSamplesPerSec=2.6364485649080707, CurrSamplesPerSec=2.6458887586408966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:09:03,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=28130, skipped=0, lr=[8.05209909812407e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:09:03,207] [INFO] [timer.py:259:stop] epoch=0/micro_step=28130/global_step=28130, RunningAvgSamplesPerSec=2.6364454573531657, CurrSamplesPerSec=2.631616875991736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:09:18,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=28140, skipped=0, lr=[8.050764383089725e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:09:18,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=28140/global_step=28140, RunningAvgSamplesPerSec=2.6364468042701557, CurrSamplesPerSec=2.5886426282595822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:09:33,773] [INFO] [logging.py:96:log_dist] [Rank 0] step=28150, skipped=0, lr=[8.049429321640095e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:09:33,788] [INFO] [timer.py:259:stop] epoch=0/micro_step=28150/global_step=28150, RunningAvgSamplesPerSec=2.6364444555646154, CurrSamplesPerSec=2.6420487254864216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:09:49,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=28160, skipped=0, lr=[8.048093913926784e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:09:49,194] [INFO] [timer.py:259:stop] epoch=0/micro_step=28160/global_step=28160, RunningAvgSamplesPerSec=2.6364376105318037, CurrSamplesPerSec=2.626916630966653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:10:04,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=28170, skipped=0, lr=[8.046758160101425e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:10:04,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=28170/global_step=28170, RunningAvgSamplesPerSec=2.6364370949113347, CurrSamplesPerSec=2.624733140174915, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:10:19,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=28180, skipped=0, lr=[8.045422060315694e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:10:19,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=28180/global_step=28180, RunningAvgSamplesPerSec=2.636429725907583, CurrSamplesPerSec=2.633137220733371, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:10:35,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=28190, skipped=0, lr=[8.044085614721304e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:10:35,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=28190/global_step=28190, RunningAvgSamplesPerSec=2.6364298204077192, CurrSamplesPerSec=2.6362345506444655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:10:50,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=28200, skipped=0, lr=[8.042748823470012e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:10:50,416] [INFO] [timer.py:259:stop] epoch=0/micro_step=28200/global_step=28200, RunningAvgSamplesPerSec=2.636427501710768, CurrSamplesPerSec=2.6086331623703853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:11:05,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=28210, skipped=0, lr=[8.04141168671361e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:11:05,735] [INFO] [timer.py:259:stop] epoch=0/micro_step=28210/global_step=28210, RunningAvgSamplesPerSec=2.636424524091, CurrSamplesPerSec=2.6054825615281403, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:11:21,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=28220, skipped=0, lr=[8.040074204603927e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:11:21,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=28220/global_step=28220, RunningAvgSamplesPerSec=2.636418488199324, CurrSamplesPerSec=2.6293163660128926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:11:36,478] [INFO] [logging.py:96:log_dist] [Rank 0] step=28230, skipped=0, lr=[8.038736377292837e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:11:36,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=28230/global_step=28230, RunningAvgSamplesPerSec=2.636410997578243, CurrSamplesPerSec=2.636592085092078, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:11:51,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=28240, skipped=0, lr=[8.037398204932252e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:11:51,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=28240/global_step=28240, RunningAvgSamplesPerSec=2.636405054411017, CurrSamplesPerSec=2.641777062623523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:12:07,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=28250, skipped=0, lr=[8.03605968767412e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:12:07,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=28250/global_step=28250, RunningAvgSamplesPerSec=2.6363986251466955, CurrSamplesPerSec=2.616295221294279, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:12:22,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=28260, skipped=0, lr=[8.03472082567043e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:12:22,554] [INFO] [timer.py:259:stop] epoch=0/micro_step=28260/global_step=28260, RunningAvgSamplesPerSec=2.6363969908714693, CurrSamplesPerSec=2.654965496198494, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:12:37,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=28270, skipped=0, lr=[8.033381619073213e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:12:37,840] [INFO] [timer.py:259:stop] epoch=0/micro_step=28270/global_step=28270, RunningAvgSamplesPerSec=2.6363971455147324, CurrSamplesPerSec=2.660298567876994, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:12:53,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=28280, skipped=0, lr=[8.032042068034532e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:12:53,238] [INFO] [timer.py:259:stop] epoch=0/micro_step=28280/global_step=28280, RunningAvgSamplesPerSec=2.6363909352907675, CurrSamplesPerSec=2.6532679412678726, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:13:08,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=28290, skipped=0, lr=[8.030702172706497e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:13:08,571] [INFO] [timer.py:259:stop] epoch=0/micro_step=28290/global_step=28290, RunningAvgSamplesPerSec=2.6363869270161295, CurrSamplesPerSec=2.599043476138503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:13:23,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=28300, skipped=0, lr=[8.029361933241252e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:13:23,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=28300/global_step=28300, RunningAvgSamplesPerSec=2.636387004764579, CurrSamplesPerSec=2.6509926889511606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:13:39,157] [INFO] [logging.py:96:log_dist] [Rank 0] step=28310, skipped=0, lr=[8.02802134979098e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:13:39,174] [INFO] [timer.py:259:stop] epoch=0/micro_step=28310/global_step=28310, RunningAvgSamplesPerSec=2.636384296052419, CurrSamplesPerSec=2.636520404713891, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:13:54,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=28320, skipped=0, lr=[8.026680422507909e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:13:54,488] [INFO] [timer.py:259:stop] epoch=0/micro_step=28320/global_step=28320, RunningAvgSamplesPerSec=2.6363814747672163, CurrSamplesPerSec=2.636136794492557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:14:09,841] [INFO] [logging.py:96:log_dist] [Rank 0] step=28330, skipped=0, lr=[8.025339151544299e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:14:09,843] [INFO] [timer.py:259:stop] epoch=0/micro_step=28330/global_step=28330, RunningAvgSamplesPerSec=2.6363774087601297, CurrSamplesPerSec=2.6003837201880837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:14:25,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=28340, skipped=0, lr=[8.02399753705245e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:14:25,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=28340/global_step=28340, RunningAvgSamplesPerSec=2.636376902500593, CurrSamplesPerSec=2.632227528812639, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:14:40,506] [INFO] [logging.py:96:log_dist] [Rank 0] step=28350, skipped=0, lr=[8.022655579184708e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:14:40,509] [INFO] [timer.py:259:stop] epoch=0/micro_step=28350/global_step=28350, RunningAvgSamplesPerSec=2.6363700752783004, CurrSamplesPerSec=2.604713994199071, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:14:55,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=28360, skipped=0, lr=[8.021313278093447e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:14:55,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=28360/global_step=28360, RunningAvgSamplesPerSec=2.6363680436423693, CurrSamplesPerSec=2.6327004736282604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:15:11,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=28370, skipped=0, lr=[8.019970633931088e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:15:11,112] [INFO] [timer.py:259:stop] epoch=0/micro_step=28370/global_step=28370, RunningAvgSamplesPerSec=2.636365778307816, CurrSamplesPerSec=2.6281108045887662, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:15:26,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=28380, skipped=0, lr=[8.018627646850092e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:15:26,385] [INFO] [timer.py:259:stop] epoch=0/micro_step=28380/global_step=28380, RunningAvgSamplesPerSec=2.6363655544998217, CurrSamplesPerSec=2.647664624020225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:15:41,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=28390, skipped=0, lr=[8.01728431700295e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:15:41,699] [INFO] [timer.py:259:stop] epoch=0/micro_step=28390/global_step=28390, RunningAvgSamplesPerSec=2.6363629136593296, CurrSamplesPerSec=2.6648092736485722, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:15:56,987] [INFO] [logging.py:96:log_dist] [Rank 0] step=28400, skipped=0, lr=[8.015940644542201e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:15:56,989] [INFO] [timer.py:259:stop] epoch=0/micro_step=28400/global_step=28400, RunningAvgSamplesPerSec=2.6363620262618013, CurrSamplesPerSec=2.6382513521909785, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:16:12,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=28410, skipped=0, lr=[8.014596629620418e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:16:12,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=28410/global_step=28410, RunningAvgSamplesPerSec=2.636357280943643, CurrSamplesPerSec=2.6358472965582074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:16:27,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=28420, skipped=0, lr=[8.013252272390216e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:16:27,717] [INFO] [timer.py:259:stop] epoch=0/micro_step=28420/global_step=28420, RunningAvgSamplesPerSec=2.6363501597184045, CurrSamplesPerSec=2.625382917262238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:16:43,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=28430, skipped=0, lr=[8.011907573004244e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:16:43,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=28430/global_step=28430, RunningAvgSamplesPerSec=2.636340318916245, CurrSamplesPerSec=2.5859019517781654, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:16:58,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=28440, skipped=0, lr=[8.010562531615196e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:16:58,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=28440/global_step=28440, RunningAvgSamplesPerSec=2.636338691566947, CurrSamplesPerSec=2.613827149072534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:17:13,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=28450, skipped=0, lr=[8.0092171483758e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:17:13,730] [INFO] [timer.py:259:stop] epoch=0/micro_step=28450/global_step=28450, RunningAvgSamplesPerSec=2.6363389219574853, CurrSamplesPerSec=2.633390575656413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:17:29,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=28460, skipped=0, lr=[8.007871423438829e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:17:29,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=28460/global_step=28460, RunningAvgSamplesPerSec=2.636336808738692, CurrSamplesPerSec=2.6020505784879804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:17:44,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=28470, skipped=0, lr=[8.006525356957084e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:17:44,355] [INFO] [timer.py:259:stop] epoch=0/micro_step=28470/global_step=28470, RunningAvgSamplesPerSec=2.6363332283719845, CurrSamplesPerSec=2.632949612455473, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:17:59,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=28480, skipped=0, lr=[8.005178949083416e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:17:59,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=28480/global_step=28480, RunningAvgSamplesPerSec=2.6363283128566657, CurrSamplesPerSec=2.616130810111499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:18:15,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=28490, skipped=0, lr=[8.003832199970708e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:18:15,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=28490/global_step=28490, RunningAvgSamplesPerSec=2.6363250307338344, CurrSamplesPerSec=2.650222579039445, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:18:30,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=28500, skipped=0, lr=[8.002485109771884e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:18:30,309] [INFO] [timer.py:259:stop] epoch=0/micro_step=28500/global_step=28500, RunningAvgSamplesPerSec=2.6363245909828703, CurrSamplesPerSec=2.6357023643994326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:18:45,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=28510, skipped=0, lr=[8.001137678639907e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:18:45,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=28510/global_step=28510, RunningAvgSamplesPerSec=2.6363212754366856, CurrSamplesPerSec=2.6317799367412484, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:19:00,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=28520, skipped=0, lr=[7.99978990672778e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:19:00,947] [INFO] [timer.py:259:stop] epoch=0/micro_step=28520/global_step=28520, RunningAvgSamplesPerSec=2.63631847360992, CurrSamplesPerSec=2.6297857919127567, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:19:16,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=28530, skipped=0, lr=[7.998441794188539e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:19:16,235] [INFO] [timer.py:259:stop] epoch=0/micro_step=28530/global_step=28530, RunningAvgSamplesPerSec=2.6363171712422213, CurrSamplesPerSec=2.614769806262925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:19:31,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=28540, skipped=0, lr=[7.997093341175263e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:19:31,626] [INFO] [timer.py:259:stop] epoch=0/micro_step=28540/global_step=28540, RunningAvgSamplesPerSec=2.636311430815289, CurrSamplesPerSec=2.6411515768566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:19:46,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=28550, skipped=0, lr=[7.995744547841074e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:19:46,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=28550/global_step=28550, RunningAvgSamplesPerSec=2.6363088040343587, CurrSamplesPerSec=2.6327859935825235, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:20:02,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=28560, skipped=0, lr=[7.994395414339125e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:20:02,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=28560/global_step=28560, RunningAvgSamplesPerSec=2.636304789232841, CurrSamplesPerSec=2.6275514406565406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:20:17,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=28570, skipped=0, lr=[7.993045940822608e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:20:17,618] [INFO] [timer.py:259:stop] epoch=0/micro_step=28570/global_step=28570, RunningAvgSamplesPerSec=2.6363012202374345, CurrSamplesPerSec=2.6410048136038733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:20:32,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=28580, skipped=0, lr=[7.99169612744476e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:20:32,892] [INFO] [timer.py:259:stop] epoch=0/micro_step=28580/global_step=28580, RunningAvgSamplesPerSec=2.6363009738911463, CurrSamplesPerSec=2.650738029955309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:20:48,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=28590, skipped=0, lr=[7.99034597435885e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:20:48,257] [INFO] [timer.py:259:stop] epoch=0/micro_step=28590/global_step=28590, RunningAvgSamplesPerSec=2.6362946516217423, CurrSamplesPerSec=2.6277950783800113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:21:03,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=28600, skipped=0, lr=[7.98899548171819e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:21:03,621] [INFO] [timer.py:259:stop] epoch=0/micro_step=28600/global_step=28600, RunningAvgSamplesPerSec=2.6362899461306393, CurrSamplesPerSec=2.6248456575503574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:21:18,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=28610, skipped=0, lr=[7.987644649676129e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:21:18,923] [INFO] [timer.py:259:stop] epoch=0/micro_step=28610/global_step=28610, RunningAvgSamplesPerSec=2.636287695505585, CurrSamplesPerSec=2.6030134329134276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:21:34,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=28620, skipped=0, lr=[7.986293478386054e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:21:34,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=28620/global_step=28620, RunningAvgSamplesPerSec=2.6362869993760794, CurrSamplesPerSec=2.637158206961306, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:21:49,541] [INFO] [logging.py:96:log_dist] [Rank 0] step=28630, skipped=0, lr=[7.98494196800139e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:21:49,543] [INFO] [timer.py:259:stop] epoch=0/micro_step=28630/global_step=28630, RunningAvgSamplesPerSec=2.63628250707915, CurrSamplesPerSec=2.535212206246713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:22:04,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=28640, skipped=0, lr=[7.983590118675605e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:22:04,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=28640/global_step=28640, RunningAvgSamplesPerSec=2.6362793515838496, CurrSamplesPerSec=2.6253586783338174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:22:20,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=28650, skipped=0, lr=[7.982237930562196e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:22:20,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=28650/global_step=28650, RunningAvgSamplesPerSec=2.636273620506933, CurrSamplesPerSec=2.508557576626834, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:22:35,494] [INFO] [logging.py:96:log_dist] [Rank 0] step=28660, skipped=0, lr=[7.980885403814708e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:22:35,496] [INFO] [timer.py:259:stop] epoch=0/micro_step=28660/global_step=28660, RunningAvgSamplesPerSec=2.6362724167288425, CurrSamplesPerSec=2.601514756900241, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:22:50,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=28670, skipped=0, lr=[7.97953253858672e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:22:50,821] [INFO] [timer.py:259:stop] epoch=0/micro_step=28670/global_step=28670, RunningAvgSamplesPerSec=2.6362687321504574, CurrSamplesPerSec=2.64948429686202, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:23:06,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=28680, skipped=0, lr=[7.978179335031853e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:23:06,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=28680/global_step=28680, RunningAvgSamplesPerSec=2.6362675962400903, CurrSamplesPerSec=2.6277041204989113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:23:21,422] [INFO] [logging.py:96:log_dist] [Rank 0] step=28690, skipped=0, lr=[7.97682579330376e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:23:21,424] [INFO] [timer.py:259:stop] epoch=0/micro_step=28690/global_step=28690, RunningAvgSamplesPerSec=2.6362654567979105, CurrSamplesPerSec=2.6436153084640415, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:23:36,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=28700, skipped=0, lr=[7.975471913556136e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:23:36,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=28700/global_step=28700, RunningAvgSamplesPerSec=2.6362629710330925, CurrSamplesPerSec=2.6307193716650064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:23:52,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=28710, skipped=0, lr=[7.974117695942719e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:23:52,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=28710/global_step=28710, RunningAvgSamplesPerSec=2.6362531174905164, CurrSamplesPerSec=2.522197747554188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:24:07,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=28720, skipped=0, lr=[7.972763140617276e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:24:07,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=28720/global_step=28720, RunningAvgSamplesPerSec=2.636251788862951, CurrSamplesPerSec=2.6397183193960196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:24:22,844] [INFO] [logging.py:96:log_dist] [Rank 0] step=28730, skipped=0, lr=[7.971408247733618e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:24:22,858] [INFO] [timer.py:259:stop] epoch=0/micro_step=28730/global_step=28730, RunningAvgSamplesPerSec=2.6362449024116117, CurrSamplesPerSec=2.6352328936618115, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:24:38,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=28740, skipped=0, lr=[7.970053017445599e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:24:38,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=28740/global_step=28740, RunningAvgSamplesPerSec=2.6362358412224873, CurrSamplesPerSec=2.5976539346458436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:24:53,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=28750, skipped=0, lr=[7.968697449907097e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:24:53,604] [INFO] [timer.py:259:stop] epoch=0/micro_step=28750/global_step=28750, RunningAvgSamplesPerSec=2.636232608006718, CurrSamplesPerSec=2.647241841320319, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:25:08,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=28760, skipped=0, lr=[7.967341545272043e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:25:08,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=28760/global_step=28760, RunningAvgSamplesPerSec=2.6362293613901215, CurrSamplesPerSec=2.586842120681601, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:25:24,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=28770, skipped=0, lr=[7.965985303694399e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:25:24,215] [INFO] [timer.py:259:stop] epoch=0/micro_step=28770/global_step=28770, RunningAvgSamplesPerSec=2.636228443980585, CurrSamplesPerSec=2.6605061260555014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:25:39,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=28780, skipped=0, lr=[7.964628725328165e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:25:39,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=28780/global_step=28780, RunningAvgSamplesPerSec=2.636227172941101, CurrSamplesPerSec=2.5684252176320435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:25:54,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=28790, skipped=0, lr=[7.963271810327386e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:25:54,781] [INFO] [timer.py:259:stop] epoch=0/micro_step=28790/global_step=28790, RunningAvgSamplesPerSec=2.6362262146842053, CurrSamplesPerSec=2.621777474252646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:26:10,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=28800, skipped=0, lr=[7.961914558846134e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:26:10,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=28800/global_step=28800, RunningAvgSamplesPerSec=2.636222121822889, CurrSamplesPerSec=2.600025864292477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:26:25,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=28810, skipped=0, lr=[7.96055697103853e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:26:25,426] [INFO] [timer.py:259:stop] epoch=0/micro_step=28810/global_step=28810, RunningAvgSamplesPerSec=2.6362185563197964, CurrSamplesPerSec=2.648132265295551, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:26:40,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=28820, skipped=0, lr=[7.959199047058724e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:26:40,893] [INFO] [timer.py:259:stop] epoch=0/micro_step=28820/global_step=28820, RunningAvgSamplesPerSec=2.636210062737421, CurrSamplesPerSec=2.6290460791198145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:26:56,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=28830, skipped=0, lr=[7.95784078706091e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:26:56,233] [INFO] [timer.py:259:stop] epoch=0/micro_step=28830/global_step=28830, RunningAvgSamplesPerSec=2.6362067658980135, CurrSamplesPerSec=2.6217111036471343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:27:11,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=28840, skipped=0, lr=[7.956482191199324e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:27:11,557] [INFO] [timer.py:259:stop] epoch=0/micro_step=28840/global_step=28840, RunningAvgSamplesPerSec=2.6362035639861885, CurrSamplesPerSec=2.658514566488788, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:27:26,839] [INFO] [logging.py:96:log_dist] [Rank 0] step=28850, skipped=0, lr=[7.95512325962823e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:27:26,841] [INFO] [timer.py:259:stop] epoch=0/micro_step=28850/global_step=28850, RunningAvgSamplesPerSec=2.6362020704278026, CurrSamplesPerSec=2.6249125976458205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:27:42,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=28860, skipped=0, lr=[7.953763992501935e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:27:42,256] [INFO] [timer.py:259:stop] epoch=0/micro_step=28860/global_step=28860, RunningAvgSamplesPerSec=2.6361948152471615, CurrSamplesPerSec=2.6243049232164952, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:27:57,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=28870, skipped=0, lr=[7.952404389974787e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:27:57,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=28870/global_step=28870, RunningAvgSamplesPerSec=2.6361931322694234, CurrSamplesPerSec=2.629552912637546, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:28:12,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=28880, skipped=0, lr=[7.951044452201166e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:28:12,928] [INFO] [timer.py:259:stop] epoch=0/micro_step=28880/global_step=28880, RunningAvgSamplesPerSec=2.63618986959177, CurrSamplesPerSec=2.63096483500313, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:28:28,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=28890, skipped=0, lr=[7.949684179335497e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:28:28,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=28890/global_step=28890, RunningAvgSamplesPerSec=2.636184419780786, CurrSamplesPerSec=2.6087390302605753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:28:43,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=28900, skipped=0, lr=[7.948323571532236e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:28:43,740] [INFO] [timer.py:259:stop] epoch=0/micro_step=28900/global_step=28900, RunningAvgSamplesPerSec=2.636176043460459, CurrSamplesPerSec=2.623711890859598, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:28:59,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=28910, skipped=0, lr=[7.946962628945883e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:28:59,120] [INFO] [timer.py:259:stop] epoch=0/micro_step=28910/global_step=28910, RunningAvgSamplesPerSec=2.6361701179329473, CurrSamplesPerSec=2.562595913138533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:29:14,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=28920, skipped=0, lr=[7.945601351730972e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:29:14,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=28920/global_step=28920, RunningAvgSamplesPerSec=2.63616995714009, CurrSamplesPerSec=2.647465748934536, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:29:29,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=28930, skipped=0, lr=[7.944239740042077e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:29:29,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=28930/global_step=28930, RunningAvgSamplesPerSec=2.6361682471156787, CurrSamplesPerSec=2.5956023223301683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:29:44,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=28940, skipped=0, lr=[7.94287779403381e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:29:44,999] [INFO] [timer.py:259:stop] epoch=0/micro_step=28940/global_step=28940, RunningAvgSamplesPerSec=2.6361653672432745, CurrSamplesPerSec=2.6295232389494076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:30:00,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=28950, skipped=0, lr=[7.941515513860818e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:30:00,332] [INFO] [timer.py:259:stop] epoch=0/micro_step=28950/global_step=28950, RunningAvgSamplesPerSec=2.6361621967214925, CurrSamplesPerSec=2.605128559713789, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:30:15,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=28960, skipped=0, lr=[7.94015289967779e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:30:15,614] [INFO] [timer.py:259:stop] epoch=0/micro_step=28960/global_step=28960, RunningAvgSamplesPerSec=2.636162192354114, CurrSamplesPerSec=2.6531525544677375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:30:31,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=28970, skipped=0, lr=[7.938789951639454e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:30:31,101] [INFO] [timer.py:259:stop] epoch=0/micro_step=28970/global_step=28970, RunningAvgSamplesPerSec=2.6361498347360794, CurrSamplesPerSec=2.5181603317786156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:30:46,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=28980, skipped=0, lr=[7.93742666990057e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:30:46,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=28980/global_step=28980, RunningAvgSamplesPerSec=2.636146206026632, CurrSamplesPerSec=2.6228677439978743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:31:01,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=28990, skipped=0, lr=[7.936063054615938e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:31:01,722] [INFO] [timer.py:259:stop] epoch=0/micro_step=28990/global_step=28990, RunningAvgSamplesPerSec=2.636144984973867, CurrSamplesPerSec=2.6308311652531806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:31:17,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=29000, skipped=0, lr=[7.9346991059404e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:31:17,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=29000/global_step=29000, RunningAvgSamplesPerSec=2.6361438582389813, CurrSamplesPerSec=2.6284282538207644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:31:32,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=29010, skipped=0, lr=[7.93333482402883e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:31:32,347] [INFO] [timer.py:259:stop] epoch=0/micro_step=29010/global_step=29010, RunningAvgSamplesPerSec=2.6361399867179625, CurrSamplesPerSec=2.6132164551305097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:31:47,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=29020, skipped=0, lr=[7.931970209036147e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:31:47,708] [INFO] [timer.py:259:stop] epoch=0/micro_step=29020/global_step=29020, RunningAvgSamplesPerSec=2.636134566384411, CurrSamplesPerSec=2.639266930833261, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:32:03,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=29030, skipped=0, lr=[7.930605261117298e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:32:03,060] [INFO] [timer.py:259:stop] epoch=0/micro_step=29030/global_step=29030, RunningAvgSamplesPerSec=2.636130134931935, CurrSamplesPerSec=2.635333894407686, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:32:18,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=29040, skipped=0, lr=[7.929239980427277e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:32:18,414] [INFO] [timer.py:259:stop] epoch=0/micro_step=29040/global_step=29040, RunningAvgSamplesPerSec=2.6361260321541136, CurrSamplesPerSec=2.5859330405836713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:32:33,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=29050, skipped=0, lr=[7.92787436712111e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:32:33,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=29050/global_step=29050, RunningAvgSamplesPerSec=2.636122580935886, CurrSamplesPerSec=2.632787233042995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:32:49,007] [INFO] [logging.py:96:log_dist] [Rank 0] step=29060, skipped=0, lr=[7.926508421353864e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:32:49,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=29060/global_step=29060, RunningAvgSamplesPerSec=2.636120289049006, CurrSamplesPerSec=2.6343515354843805, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:33:04,319] [INFO] [logging.py:96:log_dist] [Rank 0] step=29070, skipped=0, lr=[7.925142143280642e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:33:04,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=29070/global_step=29070, RunningAvgSamplesPerSec=2.6361194985292986, CurrSamplesPerSec=2.6349895302266804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:33:19,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=29080, skipped=0, lr=[7.923775533056586e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:33:19,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=29080/global_step=29080, RunningAvgSamplesPerSec=2.63611897058036, CurrSamplesPerSec=2.6433008437587855, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:33:34,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=29090, skipped=0, lr=[7.922408590836876e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:33:34,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=29090/global_step=29090, RunningAvgSamplesPerSec=2.636120917786984, CurrSamplesPerSec=2.623520700205262, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:33:50,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=29100, skipped=0, lr=[7.921041316776727e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:33:50,238] [INFO] [timer.py:259:stop] epoch=0/micro_step=29100/global_step=29100, RunningAvgSamplesPerSec=2.6361153376381314, CurrSamplesPerSec=2.585059251704935, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:34:05,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=29110, skipped=0, lr=[7.919673711031393e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:34:05,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=29110/global_step=29110, RunningAvgSamplesPerSec=2.6361146074110517, CurrSamplesPerSec=2.6399750194635736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:34:20,757] [INFO] [logging.py:96:log_dist] [Rank 0] step=29120, skipped=0, lr=[7.918305773756165e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:34:20,776] [INFO] [timer.py:259:stop] epoch=0/micro_step=29120/global_step=29120, RunningAvgSamplesPerSec=2.6361148536230163, CurrSamplesPerSec=2.6542401062553376, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:34:36,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=29130, skipped=0, lr=[7.916937505106377e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:34:36,051] [INFO] [timer.py:259:stop] epoch=0/micro_step=29130/global_step=29130, RunningAvgSamplesPerSec=2.6361131340859276, CurrSamplesPerSec=2.6441319426854037, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:34:51,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=29140, skipped=0, lr=[7.915568905237394e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:34:51,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=29140/global_step=29140, RunningAvgSamplesPerSec=2.6361047801561814, CurrSamplesPerSec=2.6525912884106004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:35:06,908] [INFO] [logging.py:96:log_dist] [Rank 0] step=29150, skipped=0, lr=[7.914199974304622e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:35:06,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=29150/global_step=29150, RunningAvgSamplesPerSec=2.6360959507900077, CurrSamplesPerSec=2.6369028832574806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:35:22,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=29160, skipped=0, lr=[7.9128307124635e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:35:22,198] [INFO] [timer.py:259:stop] epoch=0/micro_step=29160/global_step=29160, RunningAvgSamplesPerSec=2.6360950933272846, CurrSamplesPerSec=2.644970653148427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:35:37,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=29170, skipped=0, lr=[7.91146111986951e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:35:37,484] [INFO] [timer.py:259:stop] epoch=0/micro_step=29170/global_step=29170, RunningAvgSamplesPerSec=2.636094764659051, CurrSamplesPerSec=2.623965897507418, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:35:52,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=29180, skipped=0, lr=[7.910091196678173e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:35:52,737] [INFO] [timer.py:259:stop] epoch=0/micro_step=29180/global_step=29180, RunningAvgSamplesPerSec=2.636096175049973, CurrSamplesPerSec=2.644978992901357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:36:08,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=29190, skipped=0, lr=[7.90872094304504e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:36:08,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=29190/global_step=29190, RunningAvgSamplesPerSec=2.6360948108113544, CurrSamplesPerSec=2.641014791325245, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:36:23,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=29200, skipped=0, lr=[7.907350359125705e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:36:23,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=29200/global_step=29200, RunningAvgSamplesPerSec=2.636091031641442, CurrSamplesPerSec=2.633926790777574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:36:38,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=29210, skipped=0, lr=[7.9059794450758e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:36:38,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=29210/global_step=29210, RunningAvgSamplesPerSec=2.6360911055486067, CurrSamplesPerSec=2.591882738530918, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:36:53,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=29220, skipped=0, lr=[7.904608201050989e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:36:53,923] [INFO] [timer.py:259:stop] epoch=0/micro_step=29220/global_step=29220, RunningAvgSamplesPerSec=2.6360898410880083, CurrSamplesPerSec=2.638576235711159, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:37:09,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=29230, skipped=0, lr=[7.903236627206982e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:37:09,268] [INFO] [timer.py:259:stop] epoch=0/micro_step=29230/global_step=29230, RunningAvgSamplesPerSec=2.6360858054192113, CurrSamplesPerSec=2.588183781844617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:37:24,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=29240, skipped=0, lr=[7.901864723699517e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:37:24,584] [INFO] [timer.py:259:stop] epoch=0/micro_step=29240/global_step=29240, RunningAvgSamplesPerSec=2.636082313158343, CurrSamplesPerSec=2.6242556646477757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:37:39,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=29250, skipped=0, lr=[7.900492490684377e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:37:39,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=29250/global_step=29250, RunningAvgSamplesPerSec=2.6360827007986654, CurrSamplesPerSec=2.60828114244893, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:37:55,070] [INFO] [logging.py:96:log_dist] [Rank 0] step=29260, skipped=0, lr=[7.899119928317379e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:37:55,072] [INFO] [timer.py:259:stop] epoch=0/micro_step=29260/global_step=29260, RunningAvgSamplesPerSec=2.6360838926133385, CurrSamplesPerSec=2.6502380689555607, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:38:10,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=29270, skipped=0, lr=[7.897747036754377e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:38:10,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=29270/global_step=29270, RunningAvgSamplesPerSec=2.6360835541942733, CurrSamplesPerSec=2.6507958265535274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:38:25,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=29280, skipped=0, lr=[7.896373816151263e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:38:25,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=29280/global_step=29280, RunningAvgSamplesPerSec=2.63608288410942, CurrSamplesPerSec=2.6544542797567616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:38:40,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=29290, skipped=0, lr=[7.89500026666397e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:38:40,828] [INFO] [timer.py:259:stop] epoch=0/micro_step=29290/global_step=29290, RunningAvgSamplesPerSec=2.636085738073147, CurrSamplesPerSec=2.6392241669707768, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:38:56,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=29300, skipped=0, lr=[7.89362638844846e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:38:56,061] [INFO] [timer.py:259:stop] epoch=0/micro_step=29300/global_step=29300, RunningAvgSamplesPerSec=2.636086783056161, CurrSamplesPerSec=2.6558031067869936, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:39:11,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=29310, skipped=0, lr=[7.892252181660741e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:39:11,336] [INFO] [timer.py:259:stop] epoch=0/micro_step=29310/global_step=29310, RunningAvgSamplesPerSec=2.636085979017697, CurrSamplesPerSec=2.642496904088105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:39:26,562] [INFO] [logging.py:96:log_dist] [Rank 0] step=29320, skipped=0, lr=[7.890877646456851e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:39:26,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=29320/global_step=29320, RunningAvgSamplesPerSec=2.636087175161233, CurrSamplesPerSec=2.673956849141568, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:39:41,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=29330, skipped=0, lr=[7.889502782992876e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:39:41,798] [INFO] [timer.py:259:stop] epoch=0/micro_step=29330/global_step=29330, RunningAvgSamplesPerSec=2.636087554565812, CurrSamplesPerSec=2.627264236640861, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:39:57,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=29340, skipped=0, lr=[7.888127591424923e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:39:57,054] [INFO] [timer.py:259:stop] epoch=0/micro_step=29340/global_step=29340, RunningAvgSamplesPerSec=2.6360870388364233, CurrSamplesPerSec=2.655857761062655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:40:12,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=29350, skipped=0, lr=[7.886752071909153e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:40:12,281] [INFO] [timer.py:259:stop] epoch=0/micro_step=29350/global_step=29350, RunningAvgSamplesPerSec=2.6360888433499703, CurrSamplesPerSec=2.6413174847054375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:40:27,491] [INFO] [logging.py:96:log_dist] [Rank 0] step=29360, skipped=0, lr=[7.885376224601753e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:40:27,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=29360/global_step=29360, RunningAvgSamplesPerSec=2.6360906935749444, CurrSamplesPerSec=2.6525610924858354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:40:42,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=29370, skipped=0, lr=[7.88400004965895e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:40:42,728] [INFO] [timer.py:259:stop] epoch=0/micro_step=29370/global_step=29370, RunningAvgSamplesPerSec=2.6360921910690993, CurrSamplesPerSec=2.6400385791323884, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:40:57,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=29380, skipped=0, lr=[7.882623547237012e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:40:57,916] [INFO] [timer.py:259:stop] epoch=0/micro_step=29380/global_step=29380, RunningAvgSamplesPerSec=2.6360952860870652, CurrSamplesPerSec=2.632907879447335, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:41:13,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=29390, skipped=0, lr=[7.881246717492239e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:41:13,149] [INFO] [timer.py:259:stop] epoch=0/micro_step=29390/global_step=29390, RunningAvgSamplesPerSec=2.6360960749682785, CurrSamplesPerSec=2.6383090204361364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:41:28,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=29400, skipped=0, lr=[7.879869560580973e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:41:28,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=29400/global_step=29400, RunningAvgSamplesPerSec=2.636100205885772, CurrSamplesPerSec=2.6382463737556554, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:41:43,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=29410, skipped=0, lr=[7.878492076659588e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:41:43,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=29410/global_step=29410, RunningAvgSamplesPerSec=2.6361040404447165, CurrSamplesPerSec=2.6625974350466244, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:41:58,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=29420, skipped=0, lr=[7.877114265884497e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:41:58,725] [INFO] [timer.py:259:stop] epoch=0/micro_step=29420/global_step=29420, RunningAvgSamplesPerSec=2.6361061580650347, CurrSamplesPerSec=2.6138943428321952, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:42:13,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=29430, skipped=0, lr=[7.875736128412155e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:42:13,910] [INFO] [timer.py:259:stop] epoch=0/micro_step=29430/global_step=29430, RunningAvgSamplesPerSec=2.6361093478565274, CurrSamplesPerSec=2.648217118725769, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:42:29,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=29440, skipped=0, lr=[7.874357664399048e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:42:29,217] [INFO] [timer.py:259:stop] epoch=0/micro_step=29440/global_step=29440, RunningAvgSamplesPerSec=2.636107301771422, CurrSamplesPerSec=2.585956955558121, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:42:44,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=29450, skipped=0, lr=[7.8729788740017e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:42:44,445] [INFO] [timer.py:259:stop] epoch=0/micro_step=29450/global_step=29450, RunningAvgSamplesPerSec=2.636108219310369, CurrSamplesPerSec=2.6386376530820566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:42:59,629] [INFO] [logging.py:96:log_dist] [Rank 0] step=29460, skipped=0, lr=[7.871599757376671e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:42:59,646] [INFO] [timer.py:259:stop] epoch=0/micro_step=29460/global_step=29460, RunningAvgSamplesPerSec=2.636111285868369, CurrSamplesPerSec=2.664022656266533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:43:14,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=29470, skipped=0, lr=[7.870220314680566e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:43:14,863] [INFO] [timer.py:259:stop] epoch=0/micro_step=29470/global_step=29470, RunningAvgSamplesPerSec=2.6361131163171136, CurrSamplesPerSec=2.640710504736343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:43:30,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=29480, skipped=0, lr=[7.868840546070018e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:43:30,135] [INFO] [timer.py:259:stop] epoch=0/micro_step=29480/global_step=29480, RunningAvgSamplesPerSec=2.6361125200393682, CurrSamplesPerSec=2.592457462081357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:43:45,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=29490, skipped=0, lr=[7.867460451701698e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:43:45,318] [INFO] [timer.py:259:stop] epoch=0/micro_step=29490/global_step=29490, RunningAvgSamplesPerSec=2.6361171138694144, CurrSamplesPerSec=2.653421945832978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:44:00,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=29500, skipped=0, lr=[7.86608003173232e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:44:00,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=29500/global_step=29500, RunningAvgSamplesPerSec=2.6361171801610963, CurrSamplesPerSec=2.6615270916966787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:44:15,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=29510, skipped=0, lr=[7.864699286318629e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:44:15,808] [INFO] [timer.py:259:stop] epoch=0/micro_step=29510/global_step=29510, RunningAvgSamplesPerSec=2.6361175776636028, CurrSamplesPerSec=2.6414389141849375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:44:31,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=29520, skipped=0, lr=[7.863318215617411e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:44:31,044] [INFO] [timer.py:259:stop] epoch=0/micro_step=29520/global_step=29520, RunningAvgSamplesPerSec=2.636117728675557, CurrSamplesPerSec=2.6378137365270704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:44:46,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=29530, skipped=0, lr=[7.861936819785483e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:44:46,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=29530/global_step=29530, RunningAvgSamplesPerSec=2.636117340331824, CurrSamplesPerSec=2.6478463952399123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:45:01,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=29540, skipped=0, lr=[7.860555098979707e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:45:01,530] [INFO] [timer.py:259:stop] epoch=0/micro_step=29540/global_step=29540, RunningAvgSamplesPerSec=2.6361188214783047, CurrSamplesPerSec=2.6578061829719504, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:45:16,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=29550, skipped=0, lr=[7.859173053356976e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:45:16,757] [INFO] [timer.py:259:stop] epoch=0/micro_step=29550/global_step=29550, RunningAvgSamplesPerSec=2.6361198772886105, CurrSamplesPerSec=2.659319852975044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:45:32,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=29560, skipped=0, lr=[7.857790683074222e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:45:32,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=29560/global_step=29560, RunningAvgSamplesPerSec=2.6361195185435347, CurrSamplesPerSec=2.6499680679712116, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:45:47,208] [INFO] [logging.py:96:log_dist] [Rank 0] step=29570, skipped=0, lr=[7.856407988288414e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:45:47,214] [INFO] [timer.py:259:stop] epoch=0/micro_step=29570/global_step=29570, RunningAvgSamplesPerSec=2.636122052580843, CurrSamplesPerSec=2.6446858821470713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:46:02,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=29580, skipped=0, lr=[7.855024969156558e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:46:02,395] [INFO] [timer.py:259:stop] epoch=0/micro_step=29580/global_step=29580, RunningAvgSamplesPerSec=2.636126339225713, CurrSamplesPerSec=2.648346708348845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:46:17,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=29590, skipped=0, lr=[7.853641625835695e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:46:17,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=29590/global_step=29590, RunningAvgSamplesPerSec=2.6361298871554637, CurrSamplesPerSec=2.662602505808259, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:46:32,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=29600, skipped=0, lr=[7.852257958482903e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:46:32,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=29600/global_step=29600, RunningAvgSamplesPerSec=2.636127989804765, CurrSamplesPerSec=2.6310333253845117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:46:48,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=29610, skipped=0, lr=[7.8508739672553e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:46:48,109] [INFO] [timer.py:259:stop] epoch=0/micro_step=29610/global_step=29610, RunningAvgSamplesPerSec=2.6361292975565265, CurrSamplesPerSec=2.6335687385713205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:47:03,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=29620, skipped=0, lr=[7.849489652310039e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:47:03,282] [INFO] [timer.py:259:stop] epoch=0/micro_step=29620/global_step=29620, RunningAvgSamplesPerSec=2.6361330731347574, CurrSamplesPerSec=2.6729684864984775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:47:18,496] [INFO] [logging.py:96:log_dist] [Rank 0] step=29630, skipped=0, lr=[7.848105013804308e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:47:18,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=29630/global_step=29630, RunningAvgSamplesPerSec=2.636134776700371, CurrSamplesPerSec=2.6253944206391133, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:47:33,682] [INFO] [logging.py:96:log_dist] [Rank 0] step=29640, skipped=0, lr=[7.846720051895334e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:47:33,694] [INFO] [timer.py:259:stop] epoch=0/micro_step=29640/global_step=29640, RunningAvgSamplesPerSec=2.636137911425254, CurrSamplesPerSec=2.6448009506003176, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:47:48,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=29650, skipped=0, lr=[7.845334766740377e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:47:48,919] [INFO] [timer.py:259:stop] epoch=0/micro_step=29650/global_step=29650, RunningAvgSamplesPerSec=2.636139216017928, CurrSamplesPerSec=2.618709506661924, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:48:04,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=29660, skipped=0, lr=[7.843949158496743e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:48:04,084] [INFO] [timer.py:259:stop] epoch=0/micro_step=29660/global_step=29660, RunningAvgSamplesPerSec=2.636144728704439, CurrSamplesPerSec=2.646052758220467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:48:19,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=29670, skipped=0, lr=[7.84256322732176e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:48:19,285] [INFO] [timer.py:259:stop] epoch=0/micro_step=29670/global_step=29670, RunningAvgSamplesPerSec=2.6361478460806134, CurrSamplesPerSec=2.585237706861505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:48:34,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=29680, skipped=0, lr=[7.841176973372809e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:48:34,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=29680/global_step=29680, RunningAvgSamplesPerSec=2.636150628744806, CurrSamplesPerSec=2.6578352352744057, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:48:49,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=29690, skipped=0, lr=[7.839790396807294e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:48:49,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=29690/global_step=29690, RunningAvgSamplesPerSec=2.6361537674264053, CurrSamplesPerSec=2.6382675322355396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:49:04,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=29700, skipped=0, lr=[7.838403497782663e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:49:04,882] [INFO] [timer.py:259:stop] epoch=0/micro_step=29700/global_step=29700, RunningAvgSamplesPerSec=2.6361559054051598, CurrSamplesPerSec=2.6438569348212653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:49:20,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=29710, skipped=0, lr=[7.837016276456397e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:49:20,083] [INFO] [timer.py:259:stop] epoch=0/micro_step=29710/global_step=29710, RunningAvgSamplesPerSec=2.6361582096240106, CurrSamplesPerSec=2.615673178627158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:49:35,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=29720, skipped=0, lr=[7.83562873298602e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:49:35,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=29720/global_step=29720, RunningAvgSamplesPerSec=2.6361579097031123, CurrSamplesPerSec=2.647308675448628, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:49:50,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=29730, skipped=0, lr=[7.834240867529084e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:49:50,551] [INFO] [timer.py:259:stop] epoch=0/micro_step=29730/global_step=29730, RunningAvgSamplesPerSec=2.636158764784893, CurrSamplesPerSec=2.658674236547651, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:50:05,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=29740, skipped=0, lr=[7.832852680243184e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:50:05,763] [INFO] [timer.py:259:stop] epoch=0/micro_step=29740/global_step=29740, RunningAvgSamplesPerSec=2.6361605919362487, CurrSamplesPerSec=2.6339032208378, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:50:20,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=29750, skipped=0, lr=[7.831464171285948e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:50:21,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=29750/global_step=29750, RunningAvgSamplesPerSec=2.636160140816321, CurrSamplesPerSec=2.6659390234461036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:50:36,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=29760, skipped=0, lr=[7.83007534081504e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:50:36,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=29760/global_step=29760, RunningAvgSamplesPerSec=2.636162152771314, CurrSamplesPerSec=2.6340016384706795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:50:51,468] [INFO] [logging.py:96:log_dist] [Rank 0] step=29770, skipped=0, lr=[7.828686188988163e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:50:51,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=29770/global_step=29770, RunningAvgSamplesPerSec=2.6361622112372993, CurrSamplesPerSec=2.635681247043846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:51:06,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=29780, skipped=0, lr=[7.82729671596306e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:51:06,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=29780/global_step=29780, RunningAvgSamplesPerSec=2.6361666450652823, CurrSamplesPerSec=2.637446749087752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:51:21,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=29790, skipped=0, lr=[7.825906921897499e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:51:21,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=29790/global_step=29790, RunningAvgSamplesPerSec=2.636169930523565, CurrSamplesPerSec=2.646479335780941, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:51:37,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=29800, skipped=0, lr=[7.824516806949296e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:51:37,061] [INFO] [timer.py:259:stop] epoch=0/micro_step=29800/global_step=29800, RunningAvgSamplesPerSec=2.636170559381491, CurrSamplesPerSec=2.6475998609954163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:51:52,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=29810, skipped=0, lr=[7.8231263712763e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:51:52,296] [INFO] [timer.py:259:stop] epoch=0/micro_step=29810/global_step=29810, RunningAvgSamplesPerSec=2.636170833340942, CurrSamplesPerSec=2.6445003761965866, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:52:07,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=29820, skipped=0, lr=[7.82173561503639e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:52:07,485] [INFO] [timer.py:259:stop] epoch=0/micro_step=29820/global_step=29820, RunningAvgSamplesPerSec=2.636173816293142, CurrSamplesPerSec=2.647370082278613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:52:22,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=29830, skipped=0, lr=[7.820344538387495e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:52:22,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=29830/global_step=29830, RunningAvgSamplesPerSec=2.6361713829724898, CurrSamplesPerSec=2.617066556979158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:52:37,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=29840, skipped=0, lr=[7.818953141487564e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:52:37,956] [INFO] [timer.py:259:stop] epoch=0/micro_step=29840/global_step=29840, RunningAvgSamplesPerSec=2.636175159623605, CurrSamplesPerSec=2.6509889189710765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:52:53,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=29850, skipped=0, lr=[7.817561424494595e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:52:53,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=29850/global_step=29850, RunningAvgSamplesPerSec=2.6361758053848305, CurrSamplesPerSec=2.6528282664091707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:53:08,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=29860, skipped=0, lr=[7.816169387566615e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:53:08,341] [INFO] [timer.py:259:stop] epoch=0/micro_step=29860/global_step=29860, RunningAvgSamplesPerSec=2.6361816534907923, CurrSamplesPerSec=2.6432691931778822, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:53:23,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=29870, skipped=0, lr=[7.814777030861695e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:53:23,541] [INFO] [timer.py:259:stop] epoch=0/micro_step=29870/global_step=29870, RunningAvgSamplesPerSec=2.6361840950560333, CurrSamplesPerSec=2.6348653825248833, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:53:38,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=29880, skipped=0, lr=[7.813384354537934e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:53:38,796] [INFO] [timer.py:259:stop] epoch=0/micro_step=29880/global_step=29880, RunningAvgSamplesPerSec=2.636183525367885, CurrSamplesPerSec=2.636937282757641, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:53:53,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=29890, skipped=0, lr=[7.811991358753472e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:53:53,981] [INFO] [timer.py:259:stop] epoch=0/micro_step=29890/global_step=29890, RunningAvgSamplesPerSec=2.6361871056836277, CurrSamplesPerSec=2.6466171054662206, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:54:09,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=29900, skipped=0, lr=[7.810598043666482e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:54:09,202] [INFO] [timer.py:259:stop] epoch=0/micro_step=29900/global_step=29900, RunningAvgSamplesPerSec=2.6361884451405295, CurrSamplesPerSec=2.5732217666293633, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:54:24,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=29910, skipped=0, lr=[7.809204409435178e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:54:24,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=29910/global_step=29910, RunningAvgSamplesPerSec=2.636191996375949, CurrSamplesPerSec=2.6408231491104632, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:54:39,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=29920, skipped=0, lr=[7.807810456217808e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:54:39,640] [INFO] [timer.py:259:stop] epoch=0/micro_step=29920/global_step=29920, RunningAvgSamplesPerSec=2.636192944656612, CurrSamplesPerSec=2.6063187922665914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:54:54,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=29930, skipped=0, lr=[7.806416184172653e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:54:54,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=29930/global_step=29930, RunningAvgSamplesPerSec=2.636193881660774, CurrSamplesPerSec=2.6307837240091643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:55:10,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=29940, skipped=0, lr=[7.805021593458037e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:55:10,146] [INFO] [timer.py:259:stop] epoch=0/micro_step=29940/global_step=29940, RunningAvgSamplesPerSec=2.6361943759638735, CurrSamplesPerSec=2.6748968973789684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:55:25,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=29950, skipped=0, lr=[7.80362668423231e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:55:25,371] [INFO] [timer.py:259:stop] epoch=0/micro_step=29950/global_step=29950, RunningAvgSamplesPerSec=2.636196285931167, CurrSamplesPerSec=2.663724463959952, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:55:40,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=29960, skipped=0, lr=[7.80223145665387e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:55:40,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=29960/global_step=29960, RunningAvgSamplesPerSec=2.636196780526787, CurrSamplesPerSec=2.652640358254641, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:55:55,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=29970, skipped=0, lr=[7.800835910881146e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:55:55,743] [INFO] [timer.py:259:stop] epoch=0/micro_step=29970/global_step=29970, RunningAvgSamplesPerSec=2.6362019575378293, CurrSamplesPerSec=2.6458733195239716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:56:10,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=29980, skipped=0, lr=[7.799440047072599e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:56:10,928] [INFO] [timer.py:259:stop] epoch=0/micro_step=29980/global_step=29980, RunningAvgSamplesPerSec=2.6362056315753155, CurrSamplesPerSec=2.657530007294577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:56:26,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=29990, skipped=0, lr=[7.798043865386731e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:56:26,120] [INFO] [timer.py:259:stop] epoch=0/micro_step=29990/global_step=29990, RunningAvgSamplesPerSec=2.6362095794769407, CurrSamplesPerSec=2.6524386382827436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:56:41,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=30000, skipped=0, lr=[7.796647365982077e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:56:41,316] [INFO] [timer.py:259:stop] epoch=0/micro_step=30000/global_step=30000, RunningAvgSamplesPerSec=2.636213099799335, CurrSamplesPerSec=2.6469444703659626, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:56:56,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=30010, skipped=0, lr=[7.795250549017215e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:56:56,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=30010/global_step=30010, RunningAvgSamplesPerSec=2.63621175864455, CurrSamplesPerSec=2.6359321927962114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:57:11,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=30020, skipped=0, lr=[7.79385341465075e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:57:11,956] [INFO] [timer.py:259:stop] epoch=0/micro_step=30020/global_step=30020, RunningAvgSamplesPerSec=2.63620835817159, CurrSamplesPerSec=2.6236622442559443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:57:27,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=30030, skipped=0, lr=[7.792455963041328e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:57:27,318] [INFO] [timer.py:259:stop] epoch=0/micro_step=30030/global_step=30030, RunningAvgSamplesPerSec=2.6362043890376277, CurrSamplesPerSec=2.643716536199921, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:57:42,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=30040, skipped=0, lr=[7.791058194347631e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:57:42,583] [INFO] [timer.py:259:stop] epoch=0/micro_step=30040/global_step=30040, RunningAvgSamplesPerSec=2.6362047560345303, CurrSamplesPerSec=2.6545529791796194, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:57:57,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=30050, skipped=0, lr=[7.789660108728374e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:57:57,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=30050/global_step=30050, RunningAvgSamplesPerSec=2.6362041907214837, CurrSamplesPerSec=2.654513078529113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:58:13,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=30060, skipped=0, lr=[7.788261706342313e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:58:13,121] [INFO] [timer.py:259:stop] epoch=0/micro_step=30060/global_step=30060, RunningAvgSamplesPerSec=2.6362046335970204, CurrSamplesPerSec=2.6517360033313557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:58:28,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=30070, skipped=0, lr=[7.786862987348233e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:58:28,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=30070/global_step=30070, RunningAvgSamplesPerSec=2.636200826651561, CurrSamplesPerSec=2.598403852954214, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:58:43,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=30080, skipped=0, lr=[7.785463951904965e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:58:43,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=30080/global_step=30080, RunningAvgSamplesPerSec=2.6361952644601243, CurrSamplesPerSec=2.6348211060081725, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:58:59,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=30090, skipped=0, lr=[7.784064600171363e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:58:59,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=30090/global_step=30090, RunningAvgSamplesPerSec=2.6361945925902583, CurrSamplesPerSec=2.6152132612778516, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:59:14,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=30100, skipped=0, lr=[7.78266493230633e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:59:14,390] [INFO] [timer.py:259:stop] epoch=0/micro_step=30100/global_step=30100, RunningAvgSamplesPerSec=2.63619441357559, CurrSamplesPerSec=2.635326029317545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:59:29,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=30110, skipped=0, lr=[7.781264948468794e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:59:29,703] [INFO] [timer.py:259:stop] epoch=0/micro_step=30110/global_step=30110, RunningAvgSamplesPerSec=2.6361930105275793, CurrSamplesPerSec=2.552537755306669, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 12:59:44,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=30120, skipped=0, lr=[7.779864648817727e-06], mom=[(0.9, 0.95)] +[2024-11-01 12:59:44,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=30120/global_step=30120, RunningAvgSamplesPerSec=2.636193420195422, CurrSamplesPerSec=2.6711500207908023, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:00:00,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=30130, skipped=0, lr=[7.77846403351213e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:00:00,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=30130/global_step=30130, RunningAvgSamplesPerSec=2.636188210266699, CurrSamplesPerSec=2.572902912474987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:00:15,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=30140, skipped=0, lr=[7.777063102711047e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:00:15,598] [INFO] [timer.py:259:stop] epoch=0/micro_step=30140/global_step=30140, RunningAvgSamplesPerSec=2.6361870474095475, CurrSamplesPerSec=2.6421698061078014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:00:31,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=30150, skipped=0, lr=[7.775661856573553e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:00:31,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=30150/global_step=30150, RunningAvgSamplesPerSec=2.6361787902078113, CurrSamplesPerSec=2.595523617875417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:00:46,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=30160, skipped=0, lr=[7.774260295258758e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:00:46,314] [INFO] [timer.py:259:stop] epoch=0/micro_step=30160/global_step=30160, RunningAvgSamplesPerSec=2.636177081883501, CurrSamplesPerSec=2.6365137755116526, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:01:01,684] [INFO] [logging.py:96:log_dist] [Rank 0] step=30170, skipped=0, lr=[7.77285841892581e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:01:01,702] [INFO] [timer.py:259:stop] epoch=0/micro_step=30170/global_step=30170, RunningAvgSamplesPerSec=2.6361715821108365, CurrSamplesPerSec=2.638686208027265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:01:17,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=30180, skipped=0, lr=[7.771456227733896e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:01:17,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=30180/global_step=30180, RunningAvgSamplesPerSec=2.6361646148153337, CurrSamplesPerSec=2.640427896950555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:01:32,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=30190, skipped=0, lr=[7.770053721842232e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:01:32,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=30190/global_step=30190, RunningAvgSamplesPerSec=2.636165323256765, CurrSamplesPerSec=2.6299502748723045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:01:47,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=30200, skipped=0, lr=[7.768650901410074e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:01:47,613] [INFO] [timer.py:259:stop] epoch=0/micro_step=30200/global_step=30200, RunningAvgSamplesPerSec=2.636164843040507, CurrSamplesPerSec=2.639048559037325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:02:02,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=30210, skipped=0, lr=[7.767247766596712e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:02:02,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=30210/global_step=30210, RunningAvgSamplesPerSec=2.6361634572683177, CurrSamplesPerSec=2.625799157162746, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:02:18,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=30220, skipped=0, lr=[7.765844317561474e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:02:18,285] [INFO] [timer.py:259:stop] epoch=0/micro_step=30220/global_step=30220, RunningAvgSamplesPerSec=2.6361578157460155, CurrSamplesPerSec=2.4897559539574634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:02:33,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=30230, skipped=0, lr=[7.764440554463723e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:02:33,656] [INFO] [timer.py:259:stop] epoch=0/micro_step=30230/global_step=30230, RunningAvgSamplesPerSec=2.636152105547095, CurrSamplesPerSec=2.6509964589419672, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:02:48,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=30240, skipped=0, lr=[7.763036477462854e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:02:48,935] [INFO] [timer.py:259:stop] epoch=0/micro_step=30240/global_step=30240, RunningAvgSamplesPerSec=2.636151990156338, CurrSamplesPerSec=2.58588959617843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:03:04,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=30250, skipped=0, lr=[7.761632086718298e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:03:04,193] [INFO] [timer.py:259:stop] epoch=0/micro_step=30250/global_step=30250, RunningAvgSamplesPerSec=2.63615275607993, CurrSamplesPerSec=2.633310389543059, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:03:19,470] [INFO] [logging.py:96:log_dist] [Rank 0] step=30260, skipped=0, lr=[7.76022738238953e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:03:19,472] [INFO] [timer.py:259:stop] epoch=0/micro_step=30260/global_step=30260, RunningAvgSamplesPerSec=2.636151359904043, CurrSamplesPerSec=2.631290815103053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:03:34,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=30270, skipped=0, lr=[7.758822364636052e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:03:34,854] [INFO] [timer.py:259:stop] epoch=0/micro_step=30270/global_step=30270, RunningAvgSamplesPerSec=2.6361458257769748, CurrSamplesPerSec=2.6316771441931937, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:03:50,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=30280, skipped=0, lr=[7.757417033617404e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:03:50,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=30280/global_step=30280, RunningAvgSamplesPerSec=2.636142464956698, CurrSamplesPerSec=2.607323707332423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:04:05,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=30290, skipped=0, lr=[7.756011389493161e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:04:05,461] [INFO] [timer.py:259:stop] epoch=0/micro_step=30290/global_step=30290, RunningAvgSamplesPerSec=2.6361429746890552, CurrSamplesPerSec=2.6709752414615027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:04:20,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=30300, skipped=0, lr=[7.754605432422938e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:04:20,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=30300/global_step=30300, RunningAvgSamplesPerSec=2.6361413272619654, CurrSamplesPerSec=2.6336501806926167, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:04:36,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=30310, skipped=0, lr=[7.753199162566376e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:04:36,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=30310/global_step=30310, RunningAvgSamplesPerSec=2.6361392747945342, CurrSamplesPerSec=2.6267743242968953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:04:51,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=30320, skipped=0, lr=[7.751792580083162e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:04:51,478] [INFO] [timer.py:259:stop] epoch=0/micro_step=30320/global_step=30320, RunningAvgSamplesPerSec=2.636132769934741, CurrSamplesPerSec=2.6215910715187123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:05:06,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=30330, skipped=0, lr=[7.750385685133013e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:05:06,784] [INFO] [timer.py:259:stop] epoch=0/micro_step=30330/global_step=30330, RunningAvgSamplesPerSec=2.6361311528357367, CurrSamplesPerSec=2.63512238122432, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:05:22,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=30340, skipped=0, lr=[7.74897847787568e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:05:22,064] [INFO] [timer.py:259:stop] epoch=0/micro_step=30340/global_step=30340, RunningAvgSamplesPerSec=2.6361307840573898, CurrSamplesPerSec=2.6355172885024856, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:05:37,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=30350, skipped=0, lr=[7.747570958470955e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:05:37,349] [INFO] [timer.py:259:stop] epoch=0/micro_step=30350/global_step=30350, RunningAvgSamplesPerSec=2.636129717812829, CurrSamplesPerSec=2.627066357517655, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:05:52,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=30360, skipped=0, lr=[7.746163127078662e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:05:52,658] [INFO] [timer.py:259:stop] epoch=0/micro_step=30360/global_step=30360, RunningAvgSamplesPerSec=2.636126823724179, CurrSamplesPerSec=2.6359288796697466, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:06:07,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=30370, skipped=0, lr=[7.744754983858658e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:06:07,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=30370/global_step=30370, RunningAvgSamplesPerSec=2.636128239603263, CurrSamplesPerSec=2.6478986329655445, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:06:23,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=30380, skipped=0, lr=[7.74334652897084e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:06:23,205] [INFO] [timer.py:259:stop] epoch=0/micro_step=30380/global_step=30380, RunningAvgSamplesPerSec=2.6361274142857125, CurrSamplesPerSec=2.6307391720509634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:06:38,533] [INFO] [logging.py:96:log_dist] [Rank 0] step=30390, skipped=0, lr=[7.741937762575139e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:06:38,535] [INFO] [timer.py:259:stop] epoch=0/micro_step=30390/global_step=30390, RunningAvgSamplesPerSec=2.6361243150943956, CurrSamplesPerSec=2.631416277070646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:06:53,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=30400, skipped=0, lr=[7.74052868483152e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:06:53,843] [INFO] [timer.py:259:stop] epoch=0/micro_step=30400/global_step=30400, RunningAvgSamplesPerSec=2.636121043374154, CurrSamplesPerSec=2.6742969807744474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:07:09,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=30410, skipped=0, lr=[7.739119295899985e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:07:09,222] [INFO] [timer.py:259:stop] epoch=0/micro_step=30410/global_step=30410, RunningAvgSamplesPerSec=2.6361140761528308, CurrSamplesPerSec=2.5377995995866525, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:07:24,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=30420, skipped=0, lr=[7.737709595940569e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:07:24,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=30420/global_step=30420, RunningAvgSamplesPerSec=2.6361125087605166, CurrSamplesPerSec=2.6460444116949002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:07:39,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=30430, skipped=0, lr=[7.736299585113344e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:07:39,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=30430/global_step=30430, RunningAvgSamplesPerSec=2.636106003538613, CurrSamplesPerSec=2.580405030658851, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:07:55,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=30440, skipped=0, lr=[7.73488926357842e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:07:55,290] [INFO] [timer.py:259:stop] epoch=0/micro_step=30440/global_step=30440, RunningAvgSamplesPerSec=2.636097424143812, CurrSamplesPerSec=2.633725012665594, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:08:10,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=30450, skipped=0, lr=[7.733478631495937e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:08:10,637] [INFO] [timer.py:259:stop] epoch=0/micro_step=30450/global_step=30450, RunningAvgSamplesPerSec=2.6360929919034044, CurrSamplesPerSec=2.627198822156374, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:08:26,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=30460, skipped=0, lr=[7.732067689026074e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:08:26,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=30460/global_step=30460, RunningAvgSamplesPerSec=2.6360857921468033, CurrSamplesPerSec=2.623533007777305, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:08:41,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=30470, skipped=0, lr=[7.730656436329043e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:08:41,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=30470/global_step=30470, RunningAvgSamplesPerSec=2.636084806133088, CurrSamplesPerSec=2.6299374947572987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:08:56,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=30480, skipped=0, lr=[7.729244873565091e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:08:56,662] [INFO] [timer.py:259:stop] epoch=0/micro_step=30480/global_step=30480, RunningAvgSamplesPerSec=2.6360822158843646, CurrSamplesPerSec=2.642208504316636, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:09:12,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=30490, skipped=0, lr=[7.727833000894502e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:09:12,006] [INFO] [timer.py:259:stop] epoch=0/micro_step=30490/global_step=30490, RunningAvgSamplesPerSec=2.6360791747875414, CurrSamplesPerSec=2.6318633325795817, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:09:27,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=30500, skipped=0, lr=[7.726420818477597e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:09:27,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=30500/global_step=30500, RunningAvgSamplesPerSec=2.6360803698322326, CurrSamplesPerSec=2.6527385033893327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:09:42,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=30510, skipped=0, lr=[7.72500832647473e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:09:42,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=30510/global_step=30510, RunningAvgSamplesPerSec=2.63607604282232, CurrSamplesPerSec=2.6380012090632445, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:09:57,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=30520, skipped=0, lr=[7.723595525046286e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:09:57,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=30520/global_step=30520, RunningAvgSamplesPerSec=2.636074042148443, CurrSamplesPerSec=2.630245901227861, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:10:13,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=30530, skipped=0, lr=[7.722182414352693e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:10:13,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=30530/global_step=30530, RunningAvgSamplesPerSec=2.6360732067894657, CurrSamplesPerSec=2.6532465415032496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:10:28,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=30540, skipped=0, lr=[7.720768994554406e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:10:28,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=30540/global_step=30540, RunningAvgSamplesPerSec=2.636073449188758, CurrSamplesPerSec=2.5987866229568515, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:10:43,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=30550, skipped=0, lr=[7.719355265811922e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:10:43,769] [INFO] [timer.py:259:stop] epoch=0/micro_step=30550/global_step=30550, RunningAvgSamplesPerSec=2.636071864175492, CurrSamplesPerSec=2.648983553885747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:10:59,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=30560, skipped=0, lr=[7.71794122828577e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:10:59,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=30560/global_step=30560, RunningAvgSamplesPerSec=2.6360677335738045, CurrSamplesPerSec=2.604112401157384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:11:14,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=30570, skipped=0, lr=[7.716526882136518e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:11:14,306] [INFO] [timer.py:259:stop] epoch=0/micro_step=30570/global_step=30570, RunningAvgSamplesPerSec=2.63607138878851, CurrSamplesPerSec=2.6474594823480824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:11:29,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=30580, skipped=0, lr=[7.715112227524757e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:11:29,570] [INFO] [timer.py:259:stop] epoch=0/micro_step=30580/global_step=30580, RunningAvgSamplesPerSec=2.6360710512157466, CurrSamplesPerSec=2.55014465193134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:11:44,835] [INFO] [logging.py:96:log_dist] [Rank 0] step=30590, skipped=0, lr=[7.713697264611128e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:11:44,837] [INFO] [timer.py:259:stop] epoch=0/micro_step=30590/global_step=30590, RunningAvgSamplesPerSec=2.636071031963433, CurrSamplesPerSec=2.629518293399829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:12:00,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=30600, skipped=0, lr=[7.712281993556298e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:12:00,125] [INFO] [timer.py:259:stop] epoch=0/micro_step=30600/global_step=30600, RunningAvgSamplesPerSec=2.636069282766969, CurrSamplesPerSec=2.5930055892051, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:12:15,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=30610, skipped=0, lr=[7.710866414520975e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:12:15,414] [INFO] [timer.py:259:stop] epoch=0/micro_step=30610/global_step=30610, RunningAvgSamplesPerSec=2.636067493565337, CurrSamplesPerSec=2.6379589009857565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:12:30,731] [INFO] [logging.py:96:log_dist] [Rank 0] step=30620, skipped=0, lr=[7.709450527665894e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:12:30,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=30620/global_step=30620, RunningAvgSamplesPerSec=2.6360641600252763, CurrSamplesPerSec=2.639565901394087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:12:45,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=30630, skipped=0, lr=[7.708034333151832e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:12:45,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=30630/global_step=30630, RunningAvgSamplesPerSec=2.636066433172129, CurrSamplesPerSec=2.626099194702229, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:13:01,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=30640, skipped=0, lr=[7.706617831139595e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:13:01,233] [INFO] [timer.py:259:stop] epoch=0/micro_step=30640/global_step=30640, RunningAvgSamplesPerSec=2.63606651784863, CurrSamplesPerSec=2.6442761363340455, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:13:16,594] [INFO] [logging.py:96:log_dist] [Rank 0] step=30650, skipped=0, lr=[7.705201021790032e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:13:16,602] [INFO] [timer.py:259:stop] epoch=0/micro_step=30650/global_step=30650, RunningAvgSamplesPerSec=2.6360612909898964, CurrSamplesPerSec=2.5988429813760625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:13:31,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=30660, skipped=0, lr=[7.70378390526402e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:13:31,893] [INFO] [timer.py:259:stop] epoch=0/micro_step=30660/global_step=30660, RunningAvgSamplesPerSec=2.636060038202334, CurrSamplesPerSec=2.6523883179154684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:13:47,316] [INFO] [logging.py:96:log_dist] [Rank 0] step=30670, skipped=0, lr=[7.702366481722473e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:13:47,318] [INFO] [timer.py:259:stop] epoch=0/micro_step=30670/global_step=30670, RunningAvgSamplesPerSec=2.636051299621411, CurrSamplesPerSec=2.610262288210142, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:14:02,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=30680, skipped=0, lr=[7.70094875132634e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:14:02,644] [INFO] [timer.py:259:stop] epoch=0/micro_step=30680/global_step=30680, RunningAvgSamplesPerSec=2.6360480670529114, CurrSamplesPerSec=2.6285085546882834, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:14:17,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=30690, skipped=0, lr=[7.699530714236604e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:14:17,957] [INFO] [timer.py:259:stop] epoch=0/micro_step=30690/global_step=30690, RunningAvgSamplesPerSec=2.6360455508273692, CurrSamplesPerSec=2.61803769871322, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:14:33,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=30700, skipped=0, lr=[7.698112370614285e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:14:33,356] [INFO] [timer.py:259:stop] epoch=0/micro_step=30700/global_step=30700, RunningAvgSamplesPerSec=2.6360395409562263, CurrSamplesPerSec=2.64803822228045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:14:48,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=30710, skipped=0, lr=[7.696693720620437e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:14:48,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=30710/global_step=30710, RunningAvgSamplesPerSec=2.6360382005390015, CurrSamplesPerSec=2.6106997467341926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:15:03,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=30720, skipped=0, lr=[7.695274764416143e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:15:03,977] [INFO] [timer.py:259:stop] epoch=0/micro_step=30720/global_step=30720, RunningAvgSamplesPerSec=2.636035633143403, CurrSamplesPerSec=2.6349411112314085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:15:19,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=30730, skipped=0, lr=[7.693855502162532e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:15:19,389] [INFO] [timer.py:259:stop] epoch=0/micro_step=30730/global_step=30730, RunningAvgSamplesPerSec=2.636029027302097, CurrSamplesPerSec=2.6087726988617503, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:15:34,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=30740, skipped=0, lr=[7.69243593402076e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:15:34,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=30740/global_step=30740, RunningAvgSamplesPerSec=2.636029734496445, CurrSamplesPerSec=2.6304442596460134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:15:50,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=30750, skipped=0, lr=[7.691016060152016e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:15:50,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=30750/global_step=30750, RunningAvgSamplesPerSec=2.6360242959311155, CurrSamplesPerSec=2.5729976133796417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:16:05,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=30760, skipped=0, lr=[7.689595880717533e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:16:05,348] [INFO] [timer.py:259:stop] epoch=0/micro_step=30760/global_step=30760, RunningAvgSamplesPerSec=2.636020766036694, CurrSamplesPerSec=2.6372950078570847, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:16:20,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=30770, skipped=0, lr=[7.688175395878568e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:16:20,636] [INFO] [timer.py:259:stop] epoch=0/micro_step=30770/global_step=30770, RunningAvgSamplesPerSec=2.6360195258490595, CurrSamplesPerSec=2.6640019286849586, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:16:35,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=30780, skipped=0, lr=[7.68675460579642e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:16:35,902] [INFO] [timer.py:259:stop] epoch=0/micro_step=30780/global_step=30780, RunningAvgSamplesPerSec=2.6360202022778685, CurrSamplesPerSec=2.655469764410863, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:16:51,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=30790, skipped=0, lr=[7.685333510632419e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:16:51,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=30790/global_step=30790, RunningAvgSamplesPerSec=2.636017440807668, CurrSamplesPerSec=2.6523581266115523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:17:06,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=30800, skipped=0, lr=[7.683912110547932e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:17:06,437] [INFO] [timer.py:259:stop] epoch=0/micro_step=30800/global_step=30800, RunningAvgSamplesPerSec=2.6360201288228278, CurrSamplesPerSec=2.6466601092446482, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:17:21,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=30810, skipped=0, lr=[7.682490405704357e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:17:21,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=30810/global_step=30810, RunningAvgSamplesPerSec=2.636018575887492, CurrSamplesPerSec=2.649450406027156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:17:37,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=30820, skipped=0, lr=[7.681068396263133e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:17:37,150] [INFO] [timer.py:259:stop] epoch=0/micro_step=30820/global_step=30820, RunningAvgSamplesPerSec=2.6360106132568952, CurrSamplesPerSec=2.491710200739341, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:17:52,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=30830, skipped=0, lr=[7.679646082385727e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:17:52,521] [INFO] [timer.py:259:stop] epoch=0/micro_step=30830/global_step=30830, RunningAvgSamplesPerSec=2.6360059646515084, CurrSamplesPerSec=2.6437073712297807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:18:07,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=30840, skipped=0, lr=[7.678223464233642e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:18:07,793] [INFO] [timer.py:259:stop] epoch=0/micro_step=30840/global_step=30840, RunningAvgSamplesPerSec=2.636005435854836, CurrSamplesPerSec=2.623340612608608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:18:23,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=30850, skipped=0, lr=[7.67680054196842e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:18:23,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=30850/global_step=30850, RunningAvgSamplesPerSec=2.635999347452189, CurrSamplesPerSec=2.6067577650530738, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:18:38,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=30860, skipped=0, lr=[7.675377315751633e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:18:38,530] [INFO] [timer.py:259:stop] epoch=0/micro_step=30860/global_step=30860, RunningAvgSamplesPerSec=2.6359954356972546, CurrSamplesPerSec=2.6012783877213557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:18:53,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=30870, skipped=0, lr=[7.673953785744887e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:18:53,960] [INFO] [timer.py:259:stop] epoch=0/micro_step=30870/global_step=30870, RunningAvgSamplesPerSec=2.6359867974012996, CurrSamplesPerSec=2.6232725221459643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:19:09,291] [INFO] [logging.py:96:log_dist] [Rank 0] step=30880, skipped=0, lr=[7.672529952109826e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:19:09,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=30880/global_step=30880, RunningAvgSamplesPerSec=2.6359834396941557, CurrSamplesPerSec=2.6022116101754795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:19:24,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=30890, skipped=0, lr=[7.671105815008126e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:19:24,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=30890/global_step=30890, RunningAvgSamplesPerSec=2.635980362017039, CurrSamplesPerSec=2.6321800371950315, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:19:39,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=30900, skipped=0, lr=[7.6696813746015e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:19:39,984] [INFO] [timer.py:259:stop] epoch=0/micro_step=30900/global_step=30900, RunningAvgSamplesPerSec=2.6359752910141534, CurrSamplesPerSec=2.6091930210423824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:19:55,268] [INFO] [logging.py:96:log_dist] [Rank 0] step=30910, skipped=0, lr=[7.668256631051691e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:19:55,288] [INFO] [timer.py:259:stop] epoch=0/micro_step=30910/global_step=30910, RunningAvgSamplesPerSec=2.6359732519755896, CurrSamplesPerSec=2.653766947361352, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:20:10,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=30920, skipped=0, lr=[7.666831584520482e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:20:10,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=30920/global_step=30920, RunningAvgSamplesPerSec=2.6359725847790165, CurrSamplesPerSec=2.6163719263839202, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:20:25,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=30930, skipped=0, lr=[7.665406235169683e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:20:25,908] [INFO] [timer.py:259:stop] epoch=0/micro_step=30930/global_step=30930, RunningAvgSamplesPerSec=2.6359680306411675, CurrSamplesPerSec=2.6029338745437576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:20:41,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=30940, skipped=0, lr=[7.663980583161147e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:20:41,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=30940/global_step=30940, RunningAvgSamplesPerSec=2.6359615864678125, CurrSamplesPerSec=2.620948492585644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:20:56,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=30950, skipped=0, lr=[7.662554628656758e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:20:56,669] [INFO] [timer.py:259:stop] epoch=0/micro_step=30950/global_step=30950, RunningAvgSamplesPerSec=2.635957793735726, CurrSamplesPerSec=2.6328706927329644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:21:11,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=30960, skipped=0, lr=[7.661128371818431e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:21:11,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=30960/global_step=30960, RunningAvgSamplesPerSec=2.635957868206895, CurrSamplesPerSec=2.640812341484367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:21:27,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=30970, skipped=0, lr=[7.65970181280812e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:21:27,270] [INFO] [timer.py:259:stop] epoch=0/micro_step=30970/global_step=30970, RunningAvgSamplesPerSec=2.6359559781231323, CurrSamplesPerSec=2.6178481510511333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:21:42,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=30980, skipped=0, lr=[7.658274951787807e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:21:42,606] [INFO] [timer.py:259:stop] epoch=0/micro_step=30980/global_step=30980, RunningAvgSamplesPerSec=2.6359535779342003, CurrSamplesPerSec=2.6239642559495517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:21:57,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=30990, skipped=0, lr=[7.656847788919515e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:21:57,978] [INFO] [timer.py:259:stop] epoch=0/micro_step=30990/global_step=30990, RunningAvgSamplesPerSec=2.635948075416239, CurrSamplesPerSec=2.6206107428204124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:22:13,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=31000, skipped=0, lr=[7.655420324365302e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:22:13,353] [INFO] [timer.py:259:stop] epoch=0/micro_step=31000/global_step=31000, RunningAvgSamplesPerSec=2.6359438421395254, CurrSamplesPerSec=2.6369003965814106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:22:28,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=31010, skipped=0, lr=[7.653992558287254e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:22:28,671] [INFO] [timer.py:259:stop] epoch=0/micro_step=31010/global_step=31010, RunningAvgSamplesPerSec=2.6359437099365555, CurrSamplesPerSec=2.61534657173384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:22:44,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=31020, skipped=0, lr=[7.652564490847492e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:22:44,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=31020/global_step=31020, RunningAvgSamplesPerSec=2.635939418778722, CurrSamplesPerSec=2.633113664922499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:22:59,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=31030, skipped=0, lr=[7.651136122208178e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:22:59,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=31030/global_step=31030, RunningAvgSamplesPerSec=2.635932739688509, CurrSamplesPerSec=2.6065560783830035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:23:14,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=31040, skipped=0, lr=[7.6497074525315e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:23:14,803] [INFO] [timer.py:259:stop] epoch=0/micro_step=31040/global_step=31040, RunningAvgSamplesPerSec=2.635929914404525, CurrSamplesPerSec=2.638697828279151, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:23:30,145] [INFO] [logging.py:96:log_dist] [Rank 0] step=31050, skipped=0, lr=[7.648278481979688e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:23:30,162] [INFO] [timer.py:259:stop] epoch=0/micro_step=31050/global_step=31050, RunningAvgSamplesPerSec=2.635925252956022, CurrSamplesPerSec=2.62669001692502, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:23:45,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=31060, skipped=0, lr=[7.646849210714999e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:23:45,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=31060/global_step=31060, RunningAvgSamplesPerSec=2.6359207793796178, CurrSamplesPerSec=2.628241727665225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:24:00,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=31070, skipped=0, lr=[7.645419638899727e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:24:00,997] [INFO] [timer.py:259:stop] epoch=0/micro_step=31070/global_step=31070, RunningAvgSamplesPerSec=2.6359113102522214, CurrSamplesPerSec=2.600362358907579, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:24:16,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=31080, skipped=0, lr=[7.6439897666962e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:24:16,389] [INFO] [timer.py:259:stop] epoch=0/micro_step=31080/global_step=31080, RunningAvgSamplesPerSec=2.635905860965066, CurrSamplesPerSec=2.6257564176680175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:24:31,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=31090, skipped=0, lr=[7.642559594266786e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:24:31,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=31090/global_step=31090, RunningAvgSamplesPerSec=2.6358965041298514, CurrSamplesPerSec=2.6118296120277216, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:24:47,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=31100, skipped=0, lr=[7.641129121773874e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:24:47,203] [INFO] [timer.py:259:stop] epoch=0/micro_step=31100/global_step=31100, RunningAvgSamplesPerSec=2.635891565295987, CurrSamplesPerSec=2.640093832834372, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:25:02,511] [INFO] [logging.py:96:log_dist] [Rank 0] step=31110, skipped=0, lr=[7.639698349379898e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:25:02,513] [INFO] [timer.py:259:stop] epoch=0/micro_step=31110/global_step=31110, RunningAvgSamplesPerSec=2.635889898438415, CurrSamplesPerSec=2.637724986699837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:25:17,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=31120, skipped=0, lr=[7.638267277247325e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:25:17,961] [INFO] [timer.py:259:stop] epoch=0/micro_step=31120/global_step=31120, RunningAvgSamplesPerSec=2.6358808181317213, CurrSamplesPerSec=2.6218467163053107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:25:33,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=31130, skipped=0, lr=[7.636835905538648e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:25:33,400] [INFO] [timer.py:259:stop] epoch=0/micro_step=31130/global_step=31130, RunningAvgSamplesPerSec=2.635873303107137, CurrSamplesPerSec=2.6289253744198966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:25:48,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=31140, skipped=0, lr=[7.635404234416404e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:25:48,777] [INFO] [timer.py:259:stop] epoch=0/micro_step=31140/global_step=31140, RunningAvgSamplesPerSec=2.635867408113515, CurrSamplesPerSec=2.5588629293750547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:26:04,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=31150, skipped=0, lr=[7.633972264043158e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:26:04,090] [INFO] [timer.py:259:stop] epoch=0/micro_step=31150/global_step=31150, RunningAvgSamplesPerSec=2.6358648873505475, CurrSamplesPerSec=2.6208678341235356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:26:19,489] [INFO] [logging.py:96:log_dist] [Rank 0] step=31160, skipped=0, lr=[7.632539994581513e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:26:19,491] [INFO] [timer.py:259:stop] epoch=0/micro_step=31160/global_step=31160, RunningAvgSamplesPerSec=2.6358589211275945, CurrSamplesPerSec=2.6154150667944753, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:26:34,862] [INFO] [logging.py:96:log_dist] [Rank 0] step=31170, skipped=0, lr=[7.631107426194104e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:26:34,863] [INFO] [timer.py:259:stop] epoch=0/micro_step=31170/global_step=31170, RunningAvgSamplesPerSec=2.6358535176605717, CurrSamplesPerSec=2.6148720973367507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:26:50,216] [INFO] [logging.py:96:log_dist] [Rank 0] step=31180, skipped=0, lr=[7.629674559043593e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:26:50,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=31180/global_step=31180, RunningAvgSamplesPerSec=2.6358493344322267, CurrSamplesPerSec=2.6096751788545878, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:27:05,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=31190, skipped=0, lr=[7.62824139329269e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:27:05,614] [INFO] [timer.py:259:stop] epoch=0/micro_step=31190/global_step=31190, RunningAvgSamplesPerSec=2.6358434210301214, CurrSamplesPerSec=2.616019446533824, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:27:21,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=31200, skipped=0, lr=[7.62680792910413e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:27:21,055] [INFO] [timer.py:259:stop] epoch=0/micro_step=31200/global_step=31200, RunningAvgSamplesPerSec=2.6358348938826794, CurrSamplesPerSec=2.632904160728627, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:27:36,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=31210, skipped=0, lr=[7.62537416664068e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:27:36,429] [INFO] [timer.py:259:stop] epoch=0/micro_step=31210/global_step=31210, RunningAvgSamplesPerSec=2.6358299066950797, CurrSamplesPerSec=2.6270758188345718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:27:51,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=31220, skipped=0, lr=[7.6239401060651466e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:27:51,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=31220/global_step=31220, RunningAvgSamplesPerSec=2.6358239569481823, CurrSamplesPerSec=2.6158428343683493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:28:07,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=31230, skipped=0, lr=[7.622505747540366e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:28:07,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=31230/global_step=31230, RunningAvgSamplesPerSec=2.635822882541833, CurrSamplesPerSec=2.6500618295194154, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:28:22,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=31240, skipped=0, lr=[7.621071091229213e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:28:22,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=31240/global_step=31240, RunningAvgSamplesPerSec=2.635813670244658, CurrSamplesPerSec=2.614082503739602, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:28:37,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=31250, skipped=0, lr=[7.61963613729459e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:28:37,995] [INFO] [timer.py:259:stop] epoch=0/micro_step=31250/global_step=31250, RunningAvgSamplesPerSec=2.635806426503754, CurrSamplesPerSec=2.6254342724028135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:28:53,415] [INFO] [logging.py:96:log_dist] [Rank 0] step=31260, skipped=0, lr=[7.6182008858994405e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:28:53,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=31260/global_step=31260, RunningAvgSamplesPerSec=2.63579904357965, CurrSamplesPerSec=2.6279313211936226, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:29:08,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=31270, skipped=0, lr=[7.616765337206733e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:29:08,781] [INFO] [timer.py:259:stop] epoch=0/micro_step=31270/global_step=31270, RunningAvgSamplesPerSec=2.6357953183869607, CurrSamplesPerSec=2.5746635155411, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:29:24,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=31280, skipped=0, lr=[7.6153294913794785e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:29:24,216] [INFO] [timer.py:259:stop] epoch=0/micro_step=31280/global_step=31280, RunningAvgSamplesPerSec=2.63578689488156, CurrSamplesPerSec=2.619204184159389, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:29:39,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=31290, skipped=0, lr=[7.6138933485807165e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:29:39,633] [INFO] [timer.py:259:stop] epoch=0/micro_step=31290/global_step=31290, RunningAvgSamplesPerSec=2.635779458075525, CurrSamplesPerSec=2.5782878432344227, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:29:55,041] [INFO] [logging.py:96:log_dist] [Rank 0] step=31300, skipped=0, lr=[7.612456908973522e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:29:55,043] [INFO] [timer.py:259:stop] epoch=0/micro_step=31300/global_step=31300, RunningAvgSamplesPerSec=2.6357725293950973, CurrSamplesPerSec=2.612226922864214, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:30:10,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=31310, skipped=0, lr=[7.611020172721001e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:30:10,424] [INFO] [timer.py:259:stop] epoch=0/micro_step=31310/global_step=31310, RunningAvgSamplesPerSec=2.635766554865526, CurrSamplesPerSec=2.6004675565294506, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:30:25,806] [INFO] [logging.py:96:log_dist] [Rank 0] step=31320, skipped=0, lr=[7.609583139986296e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:30:25,808] [INFO] [timer.py:259:stop] epoch=0/micro_step=31320/global_step=31320, RunningAvgSamplesPerSec=2.6357608950547187, CurrSamplesPerSec=2.6348736586743837, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:30:41,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=31330, skipped=0, lr=[7.608145810932584e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:30:41,235] [INFO] [timer.py:259:stop] epoch=0/micro_step=31330/global_step=31330, RunningAvgSamplesPerSec=2.635752761044389, CurrSamplesPerSec=2.626026850611792, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:30:56,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=31340, skipped=0, lr=[7.606708185723074e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:30:56,564] [INFO] [timer.py:259:stop] epoch=0/micro_step=31340/global_step=31340, RunningAvgSamplesPerSec=2.6357497481148418, CurrSamplesPerSec=2.6193055956838993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:31:11,905] [INFO] [logging.py:96:log_dist] [Rank 0] step=31350, skipped=0, lr=[7.605270264521009e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:31:11,917] [INFO] [timer.py:259:stop] epoch=0/micro_step=31350/global_step=31350, RunningAvgSamplesPerSec=2.635745961759056, CurrSamplesPerSec=2.6315685808637683, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:31:27,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=31360, skipped=0, lr=[7.603832047489663e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:31:27,359] [INFO] [timer.py:259:stop] epoch=0/micro_step=31360/global_step=31360, RunningAvgSamplesPerSec=2.6357369282675167, CurrSamplesPerSec=2.6283936641897907, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:31:42,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=31370, skipped=0, lr=[7.602393534792349e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:31:42,716] [INFO] [timer.py:259:stop] epoch=0/micro_step=31370/global_step=31370, RunningAvgSamplesPerSec=2.6357326829542416, CurrSamplesPerSec=2.6280317630085146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:31:58,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=31380, skipped=0, lr=[7.600954726592409e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:31:58,057] [INFO] [timer.py:259:stop] epoch=0/micro_step=31380/global_step=31380, RunningAvgSamplesPerSec=2.6357294313857627, CurrSamplesPerSec=2.6209624138378103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:32:13,453] [INFO] [logging.py:96:log_dist] [Rank 0] step=31390, skipped=0, lr=[7.599515623053219e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:32:13,456] [INFO] [timer.py:259:stop] epoch=0/micro_step=31390/global_step=31390, RunningAvgSamplesPerSec=2.6357232867677416, CurrSamplesPerSec=2.6342030453746617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:32:28,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=31400, skipped=0, lr=[7.5980762243381934e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:32:28,915] [INFO] [timer.py:259:stop] epoch=0/micro_step=31400/global_step=31400, RunningAvgSamplesPerSec=2.635713027490519, CurrSamplesPerSec=2.5863412501352427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:32:44,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=31410, skipped=0, lr=[7.596636530610772e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:32:44,259] [INFO] [timer.py:259:stop] epoch=0/micro_step=31410/global_step=31410, RunningAvgSamplesPerSec=2.635710306681078, CurrSamplesPerSec=2.61997846372066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:32:59,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=31420, skipped=0, lr=[7.595196542034435e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:32:59,697] [INFO] [timer.py:259:stop] epoch=0/micro_step=31420/global_step=31420, RunningAvgSamplesPerSec=2.635701794941091, CurrSamplesPerSec=2.6025418076389553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:33:15,055] [INFO] [logging.py:96:log_dist] [Rank 0] step=31430, skipped=0, lr=[7.593756258772695e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:33:15,058] [INFO] [timer.py:259:stop] epoch=0/micro_step=31430/global_step=31430, RunningAvgSamplesPerSec=2.635697929390278, CurrSamplesPerSec=2.618922890068125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:33:30,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=31440, skipped=0, lr=[7.592315680989094e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:33:30,414] [INFO] [timer.py:259:stop] epoch=0/micro_step=31440/global_step=31440, RunningAvgSamplesPerSec=2.635694022464052, CurrSamplesPerSec=2.59953115374978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:33:45,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=31450, skipped=0, lr=[7.5908748088472105e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:33:45,797] [INFO] [timer.py:259:stop] epoch=0/micro_step=31450/global_step=31450, RunningAvgSamplesPerSec=2.635688942699137, CurrSamplesPerSec=2.6190823371496106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:34:01,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=31460, skipped=0, lr=[7.589433642510657e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:34:01,167] [INFO] [timer.py:259:stop] epoch=0/micro_step=31460/global_step=31460, RunningAvgSamplesPerSec=2.635683911422564, CurrSamplesPerSec=2.615384080489319, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:34:16,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=31470, skipped=0, lr=[7.5879921821430756e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:34:16,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=31470/global_step=31470, RunningAvgSamplesPerSec=2.635678723635704, CurrSamplesPerSec=2.6191706546569677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:34:32,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=31480, skipped=0, lr=[7.586550427908148e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:34:32,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=31480/global_step=31480, RunningAvgSamplesPerSec=2.6356657263601297, CurrSamplesPerSec=2.6303852850993446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:34:47,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=31490, skipped=0, lr=[7.5851083799695855e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:34:47,441] [INFO] [timer.py:259:stop] epoch=0/micro_step=31490/global_step=31490, RunningAvgSamplesPerSec=2.6356590296684956, CurrSamplesPerSec=2.631174030668202, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:35:02,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=31500, skipped=0, lr=[7.583666038491131e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:35:02,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=31500/global_step=31500, RunningAvgSamplesPerSec=2.6356580189743486, CurrSamplesPerSec=2.6305032368372183, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:35:18,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=31510, skipped=0, lr=[7.582223403636566e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:35:18,094] [INFO] [timer.py:259:stop] epoch=0/micro_step=31510/global_step=31510, RunningAvgSamplesPerSec=2.635653657182371, CurrSamplesPerSec=2.6221888830900784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:35:33,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=31520, skipped=0, lr=[7.580780475569698e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:35:33,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=31520/global_step=31520, RunningAvgSamplesPerSec=2.635649258170417, CurrSamplesPerSec=2.6314129752866386, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:35:48,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=31530, skipped=0, lr=[7.579337254454374e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:35:48,854] [INFO] [timer.py:259:stop] epoch=0/micro_step=31530/global_step=31530, RunningAvgSamplesPerSec=2.635642042563786, CurrSamplesPerSec=2.5889841730873515, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:36:04,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=31540, skipped=0, lr=[7.577893740454474e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:36:04,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=31540/global_step=31540, RunningAvgSamplesPerSec=2.635636151870987, CurrSamplesPerSec=2.630869119480351, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:36:19,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=31550, skipped=0, lr=[7.576449933733907e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:36:19,635] [INFO] [timer.py:259:stop] epoch=0/micro_step=31550/global_step=31550, RunningAvgSamplesPerSec=2.6356301701287483, CurrSamplesPerSec=2.592256780512224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:36:35,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=31560, skipped=0, lr=[7.575005834456618e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:36:35,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=31560/global_step=31560, RunningAvgSamplesPerSec=2.63562462693963, CurrSamplesPerSec=2.6116202286514802, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:36:50,403] [INFO] [logging.py:96:log_dist] [Rank 0] step=31570, skipped=0, lr=[7.573561442786586e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:36:50,405] [INFO] [timer.py:259:stop] epoch=0/micro_step=31570/global_step=31570, RunningAvgSamplesPerSec=2.6356185789132573, CurrSamplesPerSec=2.6357280369345166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:37:05,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=31580, skipped=0, lr=[7.572116758887822e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:37:05,732] [INFO] [timer.py:259:stop] epoch=0/micro_step=31580/global_step=31580, RunningAvgSamplesPerSec=2.635615504349191, CurrSamplesPerSec=2.608700900605058, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:37:21,081] [INFO] [logging.py:96:log_dist] [Rank 0] step=31590, skipped=0, lr=[7.57067178292437e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:37:21,084] [INFO] [timer.py:259:stop] epoch=0/micro_step=31590/global_step=31590, RunningAvgSamplesPerSec=2.6356109074644403, CurrSamplesPerSec=2.607299800720005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:37:36,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=31600, skipped=0, lr=[7.569226515060307e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:37:36,490] [INFO] [timer.py:259:stop] epoch=0/micro_step=31600/global_step=31600, RunningAvgSamplesPerSec=2.635604040211029, CurrSamplesPerSec=2.6202305203002756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:37:51,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=31610, skipped=0, lr=[7.567780955459745e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:37:51,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=31610/global_step=31610, RunningAvgSamplesPerSec=2.6355957287371687, CurrSamplesPerSec=2.5167060227501112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:38:07,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=31620, skipped=0, lr=[7.566335104286825e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:38:07,296] [INFO] [timer.py:259:stop] epoch=0/micro_step=31620/global_step=31620, RunningAvgSamplesPerSec=2.635590644486572, CurrSamplesPerSec=2.6143774251114467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:38:22,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=31630, skipped=0, lr=[7.564888961705726e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:38:22,674] [INFO] [timer.py:259:stop] epoch=0/micro_step=31630/global_step=31630, RunningAvgSamplesPerSec=2.6355860042997405, CurrSamplesPerSec=2.630116839987073, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:38:37,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=31640, skipped=0, lr=[7.56344252788066e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:38:37,996] [INFO] [timer.py:259:stop] epoch=0/micro_step=31640/global_step=31640, RunningAvgSamplesPerSec=2.635583674609382, CurrSamplesPerSec=2.6261218030479387, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:38:53,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=31650, skipped=0, lr=[7.561995802975867e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:38:53,398] [INFO] [timer.py:259:stop] epoch=0/micro_step=31650/global_step=31650, RunningAvgSamplesPerSec=2.6355767755382793, CurrSamplesPerSec=2.6380207044184316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:39:08,844] [INFO] [logging.py:96:log_dist] [Rank 0] step=31660, skipped=0, lr=[7.560548787155623e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:39:08,846] [INFO] [timer.py:259:stop] epoch=0/micro_step=31660/global_step=31660, RunningAvgSamplesPerSec=2.6355676164375454, CurrSamplesPerSec=2.573172039122139, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:39:24,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=31670, skipped=0, lr=[7.559101480584236e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:39:24,212] [INFO] [timer.py:259:stop] epoch=0/micro_step=31670/global_step=31670, RunningAvgSamplesPerSec=2.6355632281813204, CurrSamplesPerSec=2.6236331136524425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:39:39,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=31680, skipped=0, lr=[7.557653883426053e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:39:39,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=31680/global_step=31680, RunningAvgSamplesPerSec=2.635556582466198, CurrSamplesPerSec=2.5483945722697983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:39:55,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=31690, skipped=0, lr=[7.556205995845445e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:39:55,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=31690/global_step=31690, RunningAvgSamplesPerSec=2.635549171875511, CurrSamplesPerSec=2.631825349661172, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:40:10,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=31700, skipped=0, lr=[7.554757818006821e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:40:10,417] [INFO] [timer.py:259:stop] epoch=0/micro_step=31700/global_step=31700, RunningAvgSamplesPerSec=2.6355434396192794, CurrSamplesPerSec=2.5887892218421906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:40:25,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=31710, skipped=0, lr=[7.553309350074622e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:40:25,753] [INFO] [timer.py:259:stop] epoch=0/micro_step=31710/global_step=31710, RunningAvgSamplesPerSec=2.635540214343516, CurrSamplesPerSec=2.624824713804545, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:40:41,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=31720, skipped=0, lr=[7.551860592213323e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:40:41,194] [INFO] [timer.py:259:stop] epoch=0/micro_step=31720/global_step=31720, RunningAvgSamplesPerSec=2.635532064617258, CurrSamplesPerSec=2.619094603113906, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:40:56,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=31730, skipped=0, lr=[7.55041154458743e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:40:56,481] [INFO] [timer.py:259:stop] epoch=0/micro_step=31730/global_step=31730, RunningAvgSamplesPerSec=2.6355309245617273, CurrSamplesPerSec=2.6208056035761014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:41:11,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=31740, skipped=0, lr=[7.548962207361483e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:41:11,895] [INFO] [timer.py:259:stop] epoch=0/micro_step=31740/global_step=31740, RunningAvgSamplesPerSec=2.6355233753024083, CurrSamplesPerSec=2.6218274592726423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:41:27,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=31750, skipped=0, lr=[7.547512580700056e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:41:27,280] [INFO] [timer.py:259:stop] epoch=0/micro_step=31750/global_step=31750, RunningAvgSamplesPerSec=2.635517640346172, CurrSamplesPerSec=2.608228834233027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:41:42,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=31760, skipped=0, lr=[7.546062664767751e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:41:42,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=31760/global_step=31760, RunningAvgSamplesPerSec=2.6355103146655003, CurrSamplesPerSec=2.6236113686940294, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:41:58,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=31770, skipped=0, lr=[7.544612459729212e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:41:58,064] [INFO] [timer.py:259:stop] epoch=0/micro_step=31770/global_step=31770, RunningAvgSamplesPerSec=2.6355058193702474, CurrSamplesPerSec=2.627374090933617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:42:13,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=31780, skipped=0, lr=[7.543161965749104e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:42:13,415] [INFO] [timer.py:259:stop] epoch=0/micro_step=31780/global_step=31780, RunningAvgSamplesPerSec=2.6355020684252377, CurrSamplesPerSec=2.6089511975285724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:42:28,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=31790, skipped=0, lr=[7.541711182992135e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:42:28,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=31790/global_step=31790, RunningAvgSamplesPerSec=2.6354949754896553, CurrSamplesPerSec=2.5998421387266553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:42:44,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=31800, skipped=0, lr=[7.540260111623042e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:42:44,125] [INFO] [timer.py:259:stop] epoch=0/micro_step=31800/global_step=31800, RunningAvgSamplesPerSec=2.6354944504775286, CurrSamplesPerSec=2.6408800984475778, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:42:59,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=31810, skipped=0, lr=[7.538808751806592e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:42:59,564] [INFO] [timer.py:259:stop] epoch=0/micro_step=31810/global_step=31810, RunningAvgSamplesPerSec=2.635486369782675, CurrSamplesPerSec=2.549021427293116, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:43:14,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=31820, skipped=0, lr=[7.53735710370759e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:43:14,967] [INFO] [timer.py:259:stop] epoch=0/micro_step=31820/global_step=31820, RunningAvgSamplesPerSec=2.6354802995968205, CurrSamplesPerSec=2.6074626985251976, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:43:30,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=31830, skipped=0, lr=[7.535905167490869e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:43:30,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=31830/global_step=31830, RunningAvgSamplesPerSec=2.635474030260826, CurrSamplesPerSec=2.6315714702660147, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:43:45,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=31840, skipped=0, lr=[7.534452943321299e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:43:45,717] [INFO] [timer.py:259:stop] epoch=0/micro_step=31840/global_step=31840, RunningAvgSamplesPerSec=2.6354701357787684, CurrSamplesPerSec=2.6163352053010955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:44:01,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=31850, skipped=0, lr=[7.53300043136378e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:44:01,197] [INFO] [timer.py:259:stop] epoch=0/micro_step=31850/global_step=31850, RunningAvgSamplesPerSec=2.6354600041808207, CurrSamplesPerSec=2.6265991355271865, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:44:16,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=31860, skipped=0, lr=[7.531547631783242e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:44:16,564] [INFO] [timer.py:259:stop] epoch=0/micro_step=31860/global_step=31860, RunningAvgSamplesPerSec=2.635454573066588, CurrSamplesPerSec=2.625259262327055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:44:31,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=31870, skipped=0, lr=[7.530094544744655e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:44:31,922] [INFO] [timer.py:259:stop] epoch=0/micro_step=31870/global_step=31870, RunningAvgSamplesPerSec=2.635450716135986, CurrSamplesPerSec=2.630945443795886, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:44:47,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=31880, skipped=0, lr=[7.528641170413016e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:44:47,339] [INFO] [timer.py:259:stop] epoch=0/micro_step=31880/global_step=31880, RunningAvgSamplesPerSec=2.635443438525416, CurrSamplesPerSec=2.612703285831441, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:45:02,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=31890, skipped=0, lr=[7.527187508953356e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:45:02,765] [INFO] [timer.py:259:stop] epoch=0/micro_step=31890/global_step=31890, RunningAvgSamplesPerSec=2.635436567379382, CurrSamplesPerSec=2.626664931385933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:45:18,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=31900, skipped=0, lr=[7.525733560530739e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:45:18,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=31900/global_step=31900, RunningAvgSamplesPerSec=2.635432487201011, CurrSamplesPerSec=2.618131665704062, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:45:33,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=31910, skipped=0, lr=[7.524279325310261e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:45:33,506] [INFO] [timer.py:259:stop] epoch=0/micro_step=31910/global_step=31910, RunningAvgSamplesPerSec=2.635426668991617, CurrSamplesPerSec=2.626346674512106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:45:48,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=31920, skipped=0, lr=[7.52282480345705e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:45:48,862] [INFO] [timer.py:259:stop] epoch=0/micro_step=31920/global_step=31920, RunningAvgSamplesPerSec=2.6354221281211734, CurrSamplesPerSec=2.613804344702937, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:46:04,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=31930, skipped=0, lr=[7.521369995136269e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:46:04,207] [INFO] [timer.py:259:stop] epoch=0/micro_step=31930/global_step=31930, RunningAvgSamplesPerSec=2.6354191683118917, CurrSamplesPerSec=2.6318331938697104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:46:19,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=31940, skipped=0, lr=[7.519914900513112e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:46:19,547] [INFO] [timer.py:259:stop] epoch=0/micro_step=31940/global_step=31940, RunningAvgSamplesPerSec=2.635416201302311, CurrSamplesPerSec=2.608806368331996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:46:34,926] [INFO] [logging.py:96:log_dist] [Rank 0] step=31950, skipped=0, lr=[7.518459519752805e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:46:34,929] [INFO] [timer.py:259:stop] epoch=0/micro_step=31950/global_step=31950, RunningAvgSamplesPerSec=2.635411255754024, CurrSamplesPerSec=2.593620504573702, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:46:50,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=31960, skipped=0, lr=[7.5170038530206054e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:46:50,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=31960/global_step=31960, RunningAvgSamplesPerSec=2.6354039239236626, CurrSamplesPerSec=2.6290913977392685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:47:05,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=31970, skipped=0, lr=[7.515547900481807e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:47:05,683] [INFO] [timer.py:259:stop] epoch=0/micro_step=31970/global_step=31970, RunningAvgSamplesPerSec=2.635400346194524, CurrSamplesPerSec=2.606739539071238, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:47:21,090] [INFO] [logging.py:96:log_dist] [Rank 0] step=31980, skipped=0, lr=[7.514091662301734e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:47:21,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=31980/global_step=31980, RunningAvgSamplesPerSec=2.635392385592967, CurrSamplesPerSec=2.6202914958158, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:47:36,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=31990, skipped=0, lr=[7.51263513864574e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:47:36,424] [INFO] [timer.py:259:stop] epoch=0/micro_step=31990/global_step=31990, RunningAvgSamplesPerSec=2.635390190579027, CurrSamplesPerSec=2.636914073357836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:47:51,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=32000, skipped=0, lr=[7.511178329679214e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:47:51,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=32000/global_step=32000, RunningAvgSamplesPerSec=2.6353838012638886, CurrSamplesPerSec=2.631469106741667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:48:07,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=32010, skipped=0, lr=[7.509721235567579e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:48:07,215] [INFO] [timer.py:259:stop] epoch=0/micro_step=32010/global_step=32010, RunningAvgSamplesPerSec=2.635377424947692, CurrSamplesPerSec=2.611666168148359, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:48:22,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=32020, skipped=0, lr=[7.508263856476287e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:48:22,674] [INFO] [timer.py:259:stop] epoch=0/micro_step=32020/global_step=32020, RunningAvgSamplesPerSec=2.6353678867178263, CurrSamplesPerSec=2.6217053680735427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:48:38,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=32030, skipped=0, lr=[7.506806192570827e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:48:38,213] [INFO] [timer.py:259:stop] epoch=0/micro_step=32030/global_step=32030, RunningAvgSamplesPerSec=2.6353557234048623, CurrSamplesPerSec=2.6318426895531353, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:48:53,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=32040, skipped=0, lr=[7.505348244016714e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:48:53,522] [INFO] [timer.py:259:stop] epoch=0/micro_step=32040/global_step=32040, RunningAvgSamplesPerSec=2.635354102847107, CurrSamplesPerSec=2.634315135245706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:49:09,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=32050, skipped=0, lr=[7.5038900109795e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:49:09,039] [INFO] [timer.py:259:stop] epoch=0/micro_step=32050/global_step=32050, RunningAvgSamplesPerSec=2.635341662140313, CurrSamplesPerSec=2.549930701889658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:49:24,436] [INFO] [logging.py:96:log_dist] [Rank 0] step=32060, skipped=0, lr=[7.502431493624766e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:49:24,439] [INFO] [timer.py:259:stop] epoch=0/micro_step=32060/global_step=32060, RunningAvgSamplesPerSec=2.635336469306396, CurrSamplesPerSec=2.6144536103682325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:49:39,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=32070, skipped=0, lr=[7.500972692118128e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:49:39,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=32070/global_step=32070, RunningAvgSamplesPerSec=2.635330674306124, CurrSamplesPerSec=2.5737065131593044, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:49:55,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=32080, skipped=0, lr=[7.4995136066252335e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:49:55,137] [INFO] [timer.py:259:stop] epoch=0/micro_step=32080/global_step=32080, RunningAvgSamplesPerSec=2.6353296396674186, CurrSamplesPerSec=2.617495681727428, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:50:10,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=32090, skipped=0, lr=[7.498054237311763e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:50:10,555] [INFO] [timer.py:259:stop] epoch=0/micro_step=32090/global_step=32090, RunningAvgSamplesPerSec=2.6353230142165707, CurrSamplesPerSec=2.6201936908267203, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:50:25,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=32100, skipped=0, lr=[7.496594584343429e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:50:25,890] [INFO] [timer.py:259:stop] epoch=0/micro_step=32100/global_step=32100, RunningAvgSamplesPerSec=2.635320097880513, CurrSamplesPerSec=2.64275789146978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:50:41,242] [INFO] [logging.py:96:log_dist] [Rank 0] step=32110, skipped=0, lr=[7.495134647885973e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:50:41,244] [INFO] [timer.py:259:stop] epoch=0/micro_step=32110/global_step=32110, RunningAvgSamplesPerSec=2.6353157047041407, CurrSamplesPerSec=2.598879213079237, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:50:56,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=32120, skipped=0, lr=[7.4936744281051735e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:50:56,614] [INFO] [timer.py:259:stop] epoch=0/micro_step=32120/global_step=32120, RunningAvgSamplesPerSec=2.635310917834159, CurrSamplesPerSec=2.6266217524818316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:51:11,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=32130, skipped=0, lr=[7.4922139251668355e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:51:11,973] [INFO] [timer.py:259:stop] epoch=0/micro_step=32130/global_step=32130, RunningAvgSamplesPerSec=2.635306723879034, CurrSamplesPerSec=2.632050373157743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:51:27,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=32140, skipped=0, lr=[7.490753139236804e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:51:27,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=32140/global_step=32140, RunningAvgSamplesPerSec=2.635302346480023, CurrSamplesPerSec=2.626925679888992, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:51:42,707] [INFO] [logging.py:96:log_dist] [Rank 0] step=32150, skipped=0, lr=[7.489292070480949e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:51:42,710] [INFO] [timer.py:259:stop] epoch=0/micro_step=32150/global_step=32150, RunningAvgSamplesPerSec=2.6352971186591128, CurrSamplesPerSec=2.6235633669487406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:51:58,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=32160, skipped=0, lr=[7.4878307190651765e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:51:58,027] [INFO] [timer.py:259:stop] epoch=0/micro_step=32160/global_step=32160, RunningAvgSamplesPerSec=2.6352938393523804, CurrSamplesPerSec=2.6250916689160233, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:52:13,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=32170, skipped=0, lr=[7.4863690851554225e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:52:13,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=32170/global_step=32170, RunningAvgSamplesPerSec=2.635287893246977, CurrSamplesPerSec=2.6234624459310902, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:52:28,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=32180, skipped=0, lr=[7.484907168917657e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:52:28,860] [INFO] [timer.py:259:stop] epoch=0/micro_step=32180/global_step=32180, RunningAvgSamplesPerSec=2.63527968285702, CurrSamplesPerSec=2.6066330234303074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:52:44,210] [INFO] [logging.py:96:log_dist] [Rank 0] step=32190, skipped=0, lr=[7.4834449705178815e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:52:44,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=32190/global_step=32190, RunningAvgSamplesPerSec=2.6352763038762936, CurrSamplesPerSec=2.619527665275489, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:52:59,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=32200, skipped=0, lr=[7.481982490122126e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:52:59,541] [INFO] [timer.py:259:stop] epoch=0/micro_step=32200/global_step=32200, RunningAvgSamplesPerSec=2.6352739794258775, CurrSamplesPerSec=2.6049437078614224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:53:14,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=32210, skipped=0, lr=[7.480519727896459e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:53:14,913] [INFO] [timer.py:259:stop] epoch=0/micro_step=32210/global_step=32210, RunningAvgSamplesPerSec=2.6352697286788507, CurrSamplesPerSec=2.633560470617297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:53:30,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=32220, skipped=0, lr=[7.479056684006977e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:53:30,297] [INFO] [timer.py:259:stop] epoch=0/micro_step=32220/global_step=32220, RunningAvgSamplesPerSec=2.6352644869837336, CurrSamplesPerSec=2.6067792315363203, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:53:45,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=32230, skipped=0, lr=[7.477593358619808e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:53:45,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=32230/global_step=32230, RunningAvgSamplesPerSec=2.63525895878061, CurrSamplesPerSec=2.629486972017829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:54:01,106] [INFO] [logging.py:96:log_dist] [Rank 0] step=32240, skipped=0, lr=[7.476129751901113e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:54:01,113] [INFO] [timer.py:259:stop] epoch=0/micro_step=32240/global_step=32240, RunningAvgSamplesPerSec=2.635251168200389, CurrSamplesPerSec=2.6442569651737524, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:54:16,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=32250, skipped=0, lr=[7.474665864017085e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:54:16,463] [INFO] [timer.py:259:stop] epoch=0/micro_step=32250/global_step=32250, RunningAvgSamplesPerSec=2.6352479839805927, CurrSamplesPerSec=2.6223487281165805, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:54:31,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=32260, skipped=0, lr=[7.473201695133951e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:54:31,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=32260/global_step=32260, RunningAvgSamplesPerSec=2.6352409629107383, CurrSamplesPerSec=2.6400880165463017, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:54:47,267] [INFO] [logging.py:96:log_dist] [Rank 0] step=32270, skipped=0, lr=[7.471737245417966e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:54:47,271] [INFO] [timer.py:259:stop] epoch=0/micro_step=32270/global_step=32270, RunningAvgSamplesPerSec=2.6352356847854153, CurrSamplesPerSec=2.63456664833991, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:55:02,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=32280, skipped=0, lr=[7.470272515035419e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:55:02,682] [INFO] [timer.py:259:stop] epoch=0/micro_step=32280/global_step=32280, RunningAvgSamplesPerSec=2.635229433061371, CurrSamplesPerSec=2.629307300612783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:55:18,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=32290, skipped=0, lr=[7.468807504152631e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:55:18,036] [INFO] [timer.py:259:stop] epoch=0/micro_step=32290/global_step=32290, RunningAvgSamplesPerSec=2.6352257937186887, CurrSamplesPerSec=2.6344321986885144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:55:33,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=32300, skipped=0, lr=[7.467342212935953e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:55:33,346] [INFO] [timer.py:259:stop] epoch=0/micro_step=32300/global_step=32300, RunningAvgSamplesPerSec=2.6352239881952007, CurrSamplesPerSec=2.6276316879573027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:55:48,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=32310, skipped=0, lr=[7.465876641551771e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:55:48,694] [INFO] [timer.py:259:stop] epoch=0/micro_step=32310/global_step=32310, RunningAvgSamplesPerSec=2.6352214131467275, CurrSamplesPerSec=2.622588942158483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:56:04,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=32320, skipped=0, lr=[7.464410790166502e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:56:04,082] [INFO] [timer.py:259:stop] epoch=0/micro_step=32320/global_step=32320, RunningAvgSamplesPerSec=2.6352172938800433, CurrSamplesPerSec=2.6313783070548125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:56:19,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=32330, skipped=0, lr=[7.462944658946592e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:56:19,372] [INFO] [timer.py:259:stop] epoch=0/micro_step=32330/global_step=32330, RunningAvgSamplesPerSec=2.6352161345580947, CurrSamplesPerSec=2.6134688406909787, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:56:34,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=32340, skipped=0, lr=[7.46147824805852e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:56:34,611] [INFO] [timer.py:259:stop] epoch=0/micro_step=32340/global_step=32340, RunningAvgSamplesPerSec=2.6352167844219263, CurrSamplesPerSec=2.6380211192163534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:56:49,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=32350, skipped=0, lr=[7.460011557668798e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:56:49,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=32350/global_step=32350, RunningAvgSamplesPerSec=2.635214466541542, CurrSamplesPerSec=2.6215537941525606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:57:05,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=32360, skipped=0, lr=[7.458544587943971e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:57:05,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=32360/global_step=32360, RunningAvgSamplesPerSec=2.6352141736502657, CurrSamplesPerSec=2.6420333311510666, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:57:20,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=32370, skipped=0, lr=[7.457077339050614e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:57:20,726] [INFO] [timer.py:259:stop] epoch=0/micro_step=32370/global_step=32370, RunningAvgSamplesPerSec=2.6351999942379165, CurrSamplesPerSec=2.6036278519119658, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:57:36,020] [INFO] [logging.py:96:log_dist] [Rank 0] step=32380, skipped=0, lr=[7.45560981115533e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:57:36,021] [INFO] [timer.py:259:stop] epoch=0/micro_step=32380/global_step=32380, RunningAvgSamplesPerSec=2.6351979651150974, CurrSamplesPerSec=2.6356270059754667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:57:51,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=32390, skipped=0, lr=[7.454142004424761e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:57:51,293] [INFO] [timer.py:259:stop] epoch=0/micro_step=32390/global_step=32390, RunningAvgSamplesPerSec=2.6351973800172144, CurrSamplesPerSec=2.6399260016786013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:58:06,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=32400, skipped=0, lr=[7.452673919025573e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:58:06,494] [INFO] [timer.py:259:stop] epoch=0/micro_step=32400/global_step=32400, RunningAvgSamplesPerSec=2.635200072200195, CurrSamplesPerSec=2.641217688012681, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:58:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=32410, skipped=0, lr=[7.451205555124472e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:58:21,751] [INFO] [timer.py:259:stop] epoch=0/micro_step=32410/global_step=32410, RunningAvgSamplesPerSec=2.6352009201629696, CurrSamplesPerSec=2.641846117105916, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:58:36,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=32420, skipped=0, lr=[7.4497369128881895e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:58:36,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=32420/global_step=32420, RunningAvgSamplesPerSec=2.6352012992487928, CurrSamplesPerSec=2.6269133404648954, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:58:52,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=32430, skipped=0, lr=[7.448267992483489e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:58:52,221] [INFO] [timer.py:259:stop] epoch=0/micro_step=32430/global_step=32430, RunningAvgSamplesPerSec=2.6352017966404313, CurrSamplesPerSec=2.6492278359171175, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:59:07,494] [INFO] [logging.py:96:log_dist] [Rank 0] step=32440, skipped=0, lr=[7.446798794077168e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:59:07,500] [INFO] [timer.py:259:stop] epoch=0/micro_step=32440/global_step=32440, RunningAvgSamplesPerSec=2.6351997996267498, CurrSamplesPerSec=2.6385251950715873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:59:22,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=32450, skipped=0, lr=[7.445329317836054e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:59:22,752] [INFO] [timer.py:259:stop] epoch=0/micro_step=32450/global_step=32450, RunningAvgSamplesPerSec=2.6351995065605602, CurrSamplesPerSec=2.637324857211409, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:59:38,042] [INFO] [logging.py:96:log_dist] [Rank 0] step=32460, skipped=0, lr=[7.443859563927008e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:59:38,043] [INFO] [timer.py:259:stop] epoch=0/micro_step=32460/global_step=32460, RunningAvgSamplesPerSec=2.6351974344249514, CurrSamplesPerSec=2.6284616089699933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 13:59:53,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=32470, skipped=0, lr=[7.4423895325169195e-06], mom=[(0.9, 0.95)] +[2024-11-01 13:59:53,306] [INFO] [timer.py:259:stop] epoch=0/micro_step=32470/global_step=32470, RunningAvgSamplesPerSec=2.635196511765172, CurrSamplesPerSec=2.6349903579172826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:00:08,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=32480, skipped=0, lr=[7.440919223772711e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:00:08,572] [INFO] [timer.py:259:stop] epoch=0/micro_step=32480/global_step=32480, RunningAvgSamplesPerSec=2.635196012670878, CurrSamplesPerSec=2.664134760001169, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:00:23,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=32490, skipped=0, lr=[7.439448637861337e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:00:23,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=32490/global_step=32490, RunningAvgSamplesPerSec=2.63519462110131, CurrSamplesPerSec=2.6284677859423815, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:00:39,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=32500, skipped=0, lr=[7.437977774949782e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:00:39,141] [INFO] [timer.py:259:stop] epoch=0/micro_step=32500/global_step=32500, RunningAvgSamplesPerSec=2.635192581276589, CurrSamplesPerSec=2.5993160859054627, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:00:54,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=32510, skipped=0, lr=[7.4365066352050645e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:00:54,397] [INFO] [timer.py:259:stop] epoch=0/micro_step=32510/global_step=32510, RunningAvgSamplesPerSec=2.6351922640950356, CurrSamplesPerSec=2.627241197205537, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:01:09,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=32520, skipped=0, lr=[7.435035218794234e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:01:09,695] [INFO] [timer.py:259:stop] epoch=0/micro_step=32520/global_step=32520, RunningAvgSamplesPerSec=2.6351894933414948, CurrSamplesPerSec=2.632144109892507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:01:25,055] [INFO] [logging.py:96:log_dist] [Rank 0] step=32530, skipped=0, lr=[7.433563525884367e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:01:25,057] [INFO] [timer.py:259:stop] epoch=0/micro_step=32530/global_step=32530, RunningAvgSamplesPerSec=2.635184596128386, CurrSamplesPerSec=2.621516927470491, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:01:40,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=32540, skipped=0, lr=[7.432091556642576e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:01:40,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=32540/global_step=32540, RunningAvgSamplesPerSec=2.635180373900047, CurrSamplesPerSec=2.6240570071893936, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:01:55,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=32550, skipped=0, lr=[7.430619311236004e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:01:55,829] [INFO] [timer.py:259:stop] epoch=0/micro_step=32550/global_step=32550, RunningAvgSamplesPerSec=2.6351735686297486, CurrSamplesPerSec=2.6363116008562226, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:02:11,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=32560, skipped=0, lr=[7.429146789831824e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:02:11,247] [INFO] [timer.py:259:stop] epoch=0/micro_step=32560/global_step=32560, RunningAvgSamplesPerSec=2.635166489585873, CurrSamplesPerSec=2.6350333985451306, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:02:26,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=32570, skipped=0, lr=[7.427673992597244e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:02:26,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=32570/global_step=32570, RunningAvgSamplesPerSec=2.6351597467781294, CurrSamplesPerSec=2.609319225273544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:02:42,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=32580, skipped=0, lr=[7.426200919699497e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:02:42,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=32580/global_step=32580, RunningAvgSamplesPerSec=2.6351512605894643, CurrSamplesPerSec=2.6205194629796575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:02:57,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=32590, skipped=0, lr=[7.424727571305854e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:02:57,496] [INFO] [timer.py:259:stop] epoch=0/micro_step=32590/global_step=32590, RunningAvgSamplesPerSec=2.6351449110608964, CurrSamplesPerSec=2.565338376820002, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:03:12,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=32600, skipped=0, lr=[7.423253947583613e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:03:12,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=32600/global_step=32600, RunningAvgSamplesPerSec=2.6351393050250067, CurrSamplesPerSec=2.6103024941562283, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:03:28,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=32610, skipped=0, lr=[7.4217800487001035e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:03:28,317] [INFO] [timer.py:259:stop] epoch=0/micro_step=32610/global_step=32610, RunningAvgSamplesPerSec=2.635130983137524, CurrSamplesPerSec=2.624105847866964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:03:43,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=32620, skipped=0, lr=[7.420305874822687e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:03:43,720] [INFO] [timer.py:259:stop] epoch=0/micro_step=32620/global_step=32620, RunningAvgSamplesPerSec=2.6351242558321872, CurrSamplesPerSec=2.640130392946205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:03:59,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=32630, skipped=0, lr=[7.4188314261187586e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:03:59,159] [INFO] [timer.py:259:stop] epoch=0/micro_step=32630/global_step=32630, RunningAvgSamplesPerSec=2.635115520988768, CurrSamplesPerSec=2.5953786696101635, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:04:14,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=32640, skipped=0, lr=[7.41735670275574e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:04:14,551] [INFO] [timer.py:259:stop] epoch=0/micro_step=32640/global_step=32640, RunningAvgSamplesPerSec=2.6351103238743776, CurrSamplesPerSec=2.635567798808179, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:04:29,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=32650, skipped=0, lr=[7.415881704901089e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:04:29,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=32650/global_step=32650, RunningAvgSamplesPerSec=2.635106204018014, CurrSamplesPerSec=2.6303576545781286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:04:45,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=32660, skipped=0, lr=[7.4144064327222906e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:04:45,266] [INFO] [timer.py:259:stop] epoch=0/micro_step=32660/global_step=32660, RunningAvgSamplesPerSec=2.635103121107793, CurrSamplesPerSec=2.6206488120769564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:05:00,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=32670, skipped=0, lr=[7.412930886386863e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:05:00,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=32670/global_step=32670, RunningAvgSamplesPerSec=2.635096676657292, CurrSamplesPerSec=2.627254773966739, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:05:16,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=32680, skipped=0, lr=[7.411455066062354e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:05:16,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=32680/global_step=32680, RunningAvgSamplesPerSec=2.6350938488007727, CurrSamplesPerSec=2.611700725472185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:05:31,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=32690, skipped=0, lr=[7.4099789719163425e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:05:31,450] [INFO] [timer.py:259:stop] epoch=0/micro_step=32690/global_step=32690, RunningAvgSamplesPerSec=2.635086101011908, CurrSamplesPerSec=2.614821562100819, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:05:46,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=32700, skipped=0, lr=[7.408502604116443e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:05:46,804] [INFO] [timer.py:259:stop] epoch=0/micro_step=32700/global_step=32700, RunningAvgSamplesPerSec=2.6350826805061454, CurrSamplesPerSec=2.638122748638295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:06:02,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=32710, skipped=0, lr=[7.407025962830294e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:06:02,265] [INFO] [timer.py:259:stop] epoch=0/micro_step=32710/global_step=32710, RunningAvgSamplesPerSec=2.6350739623484687, CurrSamplesPerSec=2.637558700532346, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:06:17,629] [INFO] [logging.py:96:log_dist] [Rank 0] step=32720, skipped=0, lr=[7.405549048225572e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:06:17,632] [INFO] [timer.py:259:stop] epoch=0/micro_step=32720/global_step=32720, RunningAvgSamplesPerSec=2.635069956937534, CurrSamplesPerSec=2.6338858537836827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:06:33,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=32730, skipped=0, lr=[7.404071860469979e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:06:33,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=32730/global_step=32730, RunningAvgSamplesPerSec=2.6350652843450892, CurrSamplesPerSec=2.640794883198309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:06:48,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=32740, skipped=0, lr=[7.402594399731249e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:06:48,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=32740/global_step=32740, RunningAvgSamplesPerSec=2.6350620587361466, CurrSamplesPerSec=2.584031224515406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:07:03,750] [INFO] [logging.py:96:log_dist] [Rank 0] step=32750, skipped=0, lr=[7.4011166661771504e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:07:03,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=32750/global_step=32750, RunningAvgSamplesPerSec=2.6350550655065126, CurrSamplesPerSec=2.565077553206317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:07:19,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=32760, skipped=0, lr=[7.3996386599754796e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:07:19,136] [INFO] [timer.py:259:stop] epoch=0/micro_step=32760/global_step=32760, RunningAvgSamplesPerSec=2.635049910809781, CurrSamplesPerSec=2.54843715302783, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:07:34,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=32770, skipped=0, lr=[7.398160381294065e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:07:34,500] [INFO] [timer.py:259:stop] epoch=0/micro_step=32770/global_step=32770, RunningAvgSamplesPerSec=2.635045932185071, CurrSamplesPerSec=2.639663496725334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:07:49,926] [INFO] [logging.py:96:log_dist] [Rank 0] step=32780, skipped=0, lr=[7.396681830300765e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:07:49,934] [INFO] [timer.py:259:stop] epoch=0/micro_step=32780/global_step=32780, RunningAvgSamplesPerSec=2.6350376699670353, CurrSamplesPerSec=2.6037809968603463, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:08:05,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=32790, skipped=0, lr=[7.395203007163468e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:08:05,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=32790/global_step=32790, RunningAvgSamplesPerSec=2.6350327720495628, CurrSamplesPerSec=2.6302640450375274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:08:20,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=32800, skipped=0, lr=[7.393723912050099e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:08:20,682] [INFO] [timer.py:259:stop] epoch=0/micro_step=32800/global_step=32800, RunningAvgSamplesPerSec=2.6350283317768572, CurrSamplesPerSec=2.6230375141505435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:08:36,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=32810, skipped=0, lr=[7.392244545128606e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:08:36,056] [INFO] [timer.py:259:stop] epoch=0/micro_step=32810/global_step=32810, RunningAvgSamplesPerSec=2.6350247836482086, CurrSamplesPerSec=2.6045659959426004, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:08:51,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=32820, skipped=0, lr=[7.3907649065669725e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:08:51,428] [INFO] [timer.py:259:stop] epoch=0/micro_step=32820/global_step=32820, RunningAvgSamplesPerSec=2.63502084869186, CurrSamplesPerSec=2.6355636585463444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:09:06,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=32830, skipped=0, lr=[7.389284996533214e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:09:06,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=32830/global_step=32830, RunningAvgSamplesPerSec=2.6350155212357094, CurrSamplesPerSec=2.6260954951918105, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:09:22,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=32840, skipped=0, lr=[7.387804815195371e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:09:22,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=32840/global_step=32840, RunningAvgSamplesPerSec=2.6350035109600634, CurrSamplesPerSec=2.6042239659707205, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:09:37,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=32850, skipped=0, lr=[7.386324362721521e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:09:37,746] [INFO] [timer.py:259:stop] epoch=0/micro_step=32850/global_step=32850, RunningAvgSamplesPerSec=2.63499745319709, CurrSamplesPerSec=2.621204421585482, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:09:53,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=32860, skipped=0, lr=[7.38484363927977e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:09:53,222] [INFO] [timer.py:259:stop] epoch=0/micro_step=32860/global_step=32860, RunningAvgSamplesPerSec=2.6349872560757586, CurrSamplesPerSec=2.6248702976792173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:10:08,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=32870, skipped=0, lr=[7.383362645038256e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:10:08,645] [INFO] [timer.py:259:stop] epoch=0/micro_step=32870/global_step=32870, RunningAvgSamplesPerSec=2.634980182297016, CurrSamplesPerSec=2.5581660825625945, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:10:24,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=32880, skipped=0, lr=[7.381881380165142e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:10:24,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=32880/global_step=32880, RunningAvgSamplesPerSec=2.63497497024424, CurrSamplesPerSec=2.613440750209711, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:10:39,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=32890, skipped=0, lr=[7.38039984482863e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:10:39,530] [INFO] [timer.py:259:stop] epoch=0/micro_step=32890/global_step=32890, RunningAvgSamplesPerSec=2.6349639277600962, CurrSamplesPerSec=2.569817901809633, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:10:54,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=32900, skipped=0, lr=[7.378918039196946e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:10:54,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=32900/global_step=32900, RunningAvgSamplesPerSec=2.634959162674032, CurrSamplesPerSec=2.6378382059876424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:11:10,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=32910, skipped=0, lr=[7.377435963438352e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:11:10,331] [INFO] [timer.py:259:stop] epoch=0/micro_step=32910/global_step=32910, RunningAvgSamplesPerSec=2.6349529890418264, CurrSamplesPerSec=2.6255386322378174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:11:25,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=32920, skipped=0, lr=[7.375953617721137e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:11:25,704] [INFO] [timer.py:259:stop] epoch=0/micro_step=32920/global_step=32920, RunningAvgSamplesPerSec=2.6349486797742006, CurrSamplesPerSec=2.626948713790804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:11:41,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=32930, skipped=0, lr=[7.374471002213622e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:11:41,073] [INFO] [timer.py:259:stop] epoch=0/micro_step=32930/global_step=32930, RunningAvgSamplesPerSec=2.634944555194534, CurrSamplesPerSec=2.61816189999625, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:11:56,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=32940, skipped=0, lr=[7.372988117084157e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:11:56,442] [INFO] [timer.py:259:stop] epoch=0/micro_step=32940/global_step=32940, RunningAvgSamplesPerSec=2.6349397204954212, CurrSamplesPerSec=2.6119467187287433, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:12:11,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=32950, skipped=0, lr=[7.371504962501128e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:12:11,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=32950/global_step=32950, RunningAvgSamplesPerSec=2.634930343757516, CurrSamplesPerSec=2.6171265688178047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:12:27,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=32960, skipped=0, lr=[7.370021538632943e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:12:27,318] [INFO] [timer.py:259:stop] epoch=0/micro_step=32960/global_step=32960, RunningAvgSamplesPerSec=2.634924831155708, CurrSamplesPerSec=2.605971849225138, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:12:42,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=32970, skipped=0, lr=[7.3685378456480495e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:12:42,733] [INFO] [timer.py:259:stop] epoch=0/micro_step=32970/global_step=32970, RunningAvgSamplesPerSec=2.6349187397095166, CurrSamplesPerSec=2.6238493520027024, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:12:58,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=32980, skipped=0, lr=[7.3670538837149185e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:12:58,104] [INFO] [timer.py:259:stop] epoch=0/micro_step=32980/global_step=32980, RunningAvgSamplesPerSec=2.6349143035941585, CurrSamplesPerSec=2.623347996124211, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:13:13,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=32990, skipped=0, lr=[7.365569653002053e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:13:13,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=32990/global_step=32990, RunningAvgSamplesPerSec=2.6349084024949203, CurrSamplesPerSec=2.6432945968735706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:13:28,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=33000, skipped=0, lr=[7.364085153677993e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:13:28,891] [INFO] [timer.py:259:stop] epoch=0/micro_step=33000/global_step=33000, RunningAvgSamplesPerSec=2.6349025140907347, CurrSamplesPerSec=2.5542919765768257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:13:44,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=33010, skipped=0, lr=[7.3626003859113e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:13:44,227] [INFO] [timer.py:259:stop] epoch=0/micro_step=33010/global_step=33010, RunningAvgSamplesPerSec=2.634900738725118, CurrSamplesPerSec=2.651219745298029, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:13:59,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=33020, skipped=0, lr=[7.361115349870571e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:13:59,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=33020/global_step=33020, RunningAvgSamplesPerSec=2.634894285526758, CurrSamplesPerSec=2.607960027430552, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:14:14,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=33030, skipped=0, lr=[7.359630045724431e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:14:14,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=33030/global_step=33030, RunningAvgSamplesPerSec=2.634891408875941, CurrSamplesPerSec=2.6283916053118928, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:14:30,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=33040, skipped=0, lr=[7.3581444736415374e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:14:30,365] [INFO] [timer.py:259:stop] epoch=0/micro_step=33040/global_step=33040, RunningAvgSamplesPerSec=2.634886488091648, CurrSamplesPerSec=2.6263614754472657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:14:45,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=33050, skipped=0, lr=[7.356658633790579e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:14:45,659] [INFO] [timer.py:259:stop] epoch=0/micro_step=33050/global_step=33050, RunningAvgSamplesPerSec=2.6348867079317477, CurrSamplesPerSec=2.6368025910427364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:15:01,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=33060, skipped=0, lr=[7.355172526340272e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:15:01,005] [INFO] [timer.py:259:stop] epoch=0/micro_step=33060/global_step=33060, RunningAvgSamplesPerSec=2.634883714440003, CurrSamplesPerSec=2.6219540692032326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:15:16,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=33070, skipped=0, lr=[7.353686151459365e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:15:16,402] [INFO] [timer.py:259:stop] epoch=0/micro_step=33070/global_step=33070, RunningAvgSamplesPerSec=2.6348777157941634, CurrSamplesPerSec=2.625834911462713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:15:31,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=33080, skipped=0, lr=[7.352199509316636e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:15:31,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=33080/global_step=33080, RunningAvgSamplesPerSec=2.6348714009325866, CurrSamplesPerSec=2.644586247626108, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:15:47,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=33090, skipped=0, lr=[7.350712600080891e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:15:47,166] [INFO] [timer.py:259:stop] epoch=0/micro_step=33090/global_step=33090, RunningAvgSamplesPerSec=2.634869611865534, CurrSamplesPerSec=2.6264457617301042, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:16:02,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=33100, skipped=0, lr=[7.349225423920975e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:16:02,527] [INFO] [timer.py:259:stop] epoch=0/micro_step=33100/global_step=33100, RunningAvgSamplesPerSec=2.634865849542502, CurrSamplesPerSec=2.6514988013810274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:16:17,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=33110, skipped=0, lr=[7.347737981005753e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:16:17,947] [INFO] [timer.py:259:stop] epoch=0/micro_step=33110/global_step=33110, RunningAvgSamplesPerSec=2.634858332203136, CurrSamplesPerSec=2.6081667969758215, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:16:33,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=33120, skipped=0, lr=[7.346250271504125e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:16:33,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=33120/global_step=33120, RunningAvgSamplesPerSec=2.6348545247364066, CurrSamplesPerSec=2.621706187439662, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:16:48,656] [INFO] [logging.py:96:log_dist] [Rank 0] step=33130, skipped=0, lr=[7.344762295585022e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:16:48,658] [INFO] [timer.py:259:stop] epoch=0/micro_step=33130/global_step=33130, RunningAvgSamplesPerSec=2.634849919431615, CurrSamplesPerSec=2.5771258402592885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:17:04,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=33140, skipped=0, lr=[7.343274053417404e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:17:04,046] [INFO] [timer.py:259:stop] epoch=0/micro_step=33140/global_step=33140, RunningAvgSamplesPerSec=2.634844728764267, CurrSamplesPerSec=2.6331785477356755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:17:19,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=33150, skipped=0, lr=[7.3417855451702605e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:17:19,424] [INFO] [timer.py:259:stop] epoch=0/micro_step=33150/global_step=33150, RunningAvgSamplesPerSec=2.634839778133221, CurrSamplesPerSec=2.584520848870596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:17:34,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=33160, skipped=0, lr=[7.3402967710126116e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:17:34,835] [INFO] [timer.py:259:stop] epoch=0/micro_step=33160/global_step=33160, RunningAvgSamplesPerSec=2.6348340608938554, CurrSamplesPerSec=2.6369356249298197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:17:50,228] [INFO] [logging.py:96:log_dist] [Rank 0] step=33170, skipped=0, lr=[7.338807731113511e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:17:50,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=33170/global_step=33170, RunningAvgSamplesPerSec=2.6348282841118826, CurrSamplesPerSec=2.6008904481616937, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:18:05,612] [INFO] [logging.py:96:log_dist] [Rank 0] step=33180, skipped=0, lr=[7.337318425642036e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:18:05,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=33180/global_step=33180, RunningAvgSamplesPerSec=2.6348229285215226, CurrSamplesPerSec=2.6331554044545653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:18:21,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=33190, skipped=0, lr=[7.335828854767301e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:18:21,021] [INFO] [timer.py:259:stop] epoch=0/micro_step=33190/global_step=33190, RunningAvgSamplesPerSec=2.6348167597572205, CurrSamplesPerSec=2.6151199112818664, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:18:36,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=33200, skipped=0, lr=[7.334339018658445e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:18:36,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=33200/global_step=33200, RunningAvgSamplesPerSec=2.6348134586325336, CurrSamplesPerSec=2.6224343966767565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:18:51,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=33210, skipped=0, lr=[7.332848917484641e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:18:51,789] [INFO] [timer.py:259:stop] epoch=0/micro_step=33210/global_step=33210, RunningAvgSamplesPerSec=2.6348071664043897, CurrSamplesPerSec=2.636670813697926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:19:07,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=33220, skipped=0, lr=[7.331358551415088e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:19:07,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=33220/global_step=33220, RunningAvgSamplesPerSec=2.6347961563992093, CurrSamplesPerSec=2.459067325603826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:19:22,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=33230, skipped=0, lr=[7.32986792061902e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:19:22,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=33230/global_step=33230, RunningAvgSamplesPerSec=2.634788792206297, CurrSamplesPerSec=2.6168673537367764, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:19:38,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=33240, skipped=0, lr=[7.328377025265697e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:19:38,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=33240/global_step=33240, RunningAvgSamplesPerSec=2.634784269901807, CurrSamplesPerSec=2.5997539113354473, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:19:53,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=33250, skipped=0, lr=[7.3268858655244125e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:19:53,471] [INFO] [timer.py:259:stop] epoch=0/micro_step=33250/global_step=33250, RunningAvgSamplesPerSec=2.6347800897148623, CurrSamplesPerSec=2.6181002064386396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:20:08,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=33260, skipped=0, lr=[7.325394441564486e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:20:08,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=33260/global_step=33260, RunningAvgSamplesPerSec=2.6347741667308013, CurrSamplesPerSec=2.5977026018274874, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:20:24,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=33270, skipped=0, lr=[7.323902753555272e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:20:24,254] [INFO] [timer.py:259:stop] epoch=0/micro_step=33270/global_step=33270, RunningAvgSamplesPerSec=2.6347698219537, CurrSamplesPerSec=2.617749303031186, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:20:39,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=33280, skipped=0, lr=[7.322410801666149e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:20:39,641] [INFO] [timer.py:259:stop] epoch=0/micro_step=33280/global_step=33280, RunningAvgSamplesPerSec=2.6347650274947907, CurrSamplesPerSec=2.6050763778682127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:20:54,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=33290, skipped=0, lr=[7.32091858606653e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:20:55,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=33290/global_step=33290, RunningAvgSamplesPerSec=2.6347604689734316, CurrSamplesPerSec=2.581097766838709, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:21:10,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=33300, skipped=0, lr=[7.319426106925856e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:21:10,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=33300/global_step=33300, RunningAvgSamplesPerSec=2.634756131151191, CurrSamplesPerSec=2.5926862208820207, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:21:25,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=33310, skipped=0, lr=[7.3179333644136004e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:21:25,765] [INFO] [timer.py:259:stop] epoch=0/micro_step=33310/global_step=33310, RunningAvgSamplesPerSec=2.6347519831806725, CurrSamplesPerSec=2.6278263594684375, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:21:41,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=33320, skipped=0, lr=[7.316440358699263e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:21:41,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=33320/global_step=33320, RunningAvgSamplesPerSec=2.634749328043322, CurrSamplesPerSec=2.624251559850519, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:21:56,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=33330, skipped=0, lr=[7.3149470899523734e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:21:56,590] [INFO] [timer.py:259:stop] epoch=0/micro_step=33330/global_step=33330, RunningAvgSamplesPerSec=2.634739326675968, CurrSamplesPerSec=2.616013735836693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:22:11,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=33340, skipped=0, lr=[7.3134535583424984e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:22:11,940] [INFO] [timer.py:259:stop] epoch=0/micro_step=33340/global_step=33340, RunningAvgSamplesPerSec=2.634735974775523, CurrSamplesPerSec=2.636673299941008, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:22:27,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=33350, skipped=0, lr=[7.3119597640392225e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:22:27,416] [INFO] [timer.py:259:stop] epoch=0/micro_step=33350/global_step=33350, RunningAvgSamplesPerSec=2.6347276192068274, CurrSamplesPerSec=2.623064580988609, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:22:42,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=33360, skipped=0, lr=[7.31046570721217e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:22:42,842] [INFO] [timer.py:259:stop] epoch=0/micro_step=33360/global_step=33360, RunningAvgSamplesPerSec=2.634720656382333, CurrSamplesPerSec=2.6096784263125756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:22:58,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=33370, skipped=0, lr=[7.308971388030991e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:22:58,330] [INFO] [timer.py:259:stop] epoch=0/micro_step=33370/global_step=33370, RunningAvgSamplesPerSec=2.634710729901985, CurrSamplesPerSec=2.552283410733163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:23:13,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=33380, skipped=0, lr=[7.3074768066653646e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:23:13,685] [INFO] [timer.py:259:stop] epoch=0/micro_step=33380/global_step=33380, RunningAvgSamplesPerSec=2.634706902475984, CurrSamplesPerSec=2.637833229111427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:23:29,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=33390, skipped=0, lr=[7.305981963285003e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:23:29,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=33390/global_step=33390, RunningAvgSamplesPerSec=2.634700274384657, CurrSamplesPerSec=2.5956167787692936, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:23:44,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=33400, skipped=0, lr=[7.304486858059645e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:23:44,438] [INFO] [timer.py:259:stop] epoch=0/micro_step=33400/global_step=33400, RunningAvgSamplesPerSec=2.6346965722709013, CurrSamplesPerSec=2.6358394284033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:23:59,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=33410, skipped=0, lr=[7.302991491159062e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:23:59,794] [INFO] [timer.py:259:stop] epoch=0/micro_step=33410/global_step=33410, RunningAvgSamplesPerSec=2.634692574751937, CurrSamplesPerSec=2.6158460971944435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:24:15,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=33420, skipped=0, lr=[7.301495862753051e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:24:15,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=33420/global_step=33420, RunningAvgSamplesPerSec=2.63468487021186, CurrSamplesPerSec=2.6272062274062025, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:24:30,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=33430, skipped=0, lr=[7.299999973011441e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:24:30,690] [INFO] [timer.py:259:stop] epoch=0/micro_step=33430/global_step=33430, RunningAvgSamplesPerSec=2.6346776728556796, CurrSamplesPerSec=2.614015707588976, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:24:46,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=33440, skipped=0, lr=[7.298503822104093e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:24:46,093] [INFO] [timer.py:259:stop] epoch=0/micro_step=33440/global_step=33440, RunningAvgSamplesPerSec=2.6346720185630477, CurrSamplesPerSec=2.6333203092099153, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:25:01,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=33450, skipped=0, lr=[7.2970074102008935e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:25:01,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=33450/global_step=33450, RunningAvgSamplesPerSec=2.6346678145546316, CurrSamplesPerSec=2.6149303783002327, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:25:16,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=33460, skipped=0, lr=[7.295510737471763e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:25:16,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=33460/global_step=33460, RunningAvgSamplesPerSec=2.6346615376083964, CurrSamplesPerSec=2.6171910744072213, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:25:32,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=33470, skipped=0, lr=[7.294013804086646e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:25:32,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=33470/global_step=33470, RunningAvgSamplesPerSec=2.6346543936208957, CurrSamplesPerSec=2.6060973372114704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:25:47,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=33480, skipped=0, lr=[7.292516610215521e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:25:47,650] [INFO] [timer.py:259:stop] epoch=0/micro_step=33480/global_step=33480, RunningAvgSamplesPerSec=2.6346520629230303, CurrSamplesPerSec=2.626940487350929, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:26:03,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=33490, skipped=0, lr=[7.291019156028396e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:26:03,147] [INFO] [timer.py:259:stop] epoch=0/micro_step=33490/global_step=33490, RunningAvgSamplesPerSec=2.6346413882092174, CurrSamplesPerSec=2.6035672454305816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:26:18,591] [INFO] [logging.py:96:log_dist] [Rank 0] step=33500, skipped=0, lr=[7.289521441695306e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:26:18,595] [INFO] [timer.py:259:stop] epoch=0/micro_step=33500/global_step=33500, RunningAvgSamplesPerSec=2.6346335230086133, CurrSamplesPerSec=2.631060970102757, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:26:33,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=33510, skipped=0, lr=[7.288023467386318e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:26:33,962] [INFO] [timer.py:259:stop] epoch=0/micro_step=33510/global_step=33510, RunningAvgSamplesPerSec=2.6346291919102374, CurrSamplesPerSec=2.6411257985605356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:26:49,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=33520, skipped=0, lr=[7.2865252332715266e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:26:49,305] [INFO] [timer.py:259:stop] epoch=0/micro_step=33520/global_step=33520, RunningAvgSamplesPerSec=2.634626126981761, CurrSamplesPerSec=2.6385841202136744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:27:04,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=33530, skipped=0, lr=[7.285026739521055e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:27:04,664] [INFO] [timer.py:259:stop] epoch=0/micro_step=33530/global_step=33530, RunningAvgSamplesPerSec=2.634622516555318, CurrSamplesPerSec=2.6290510228932873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:27:20,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=33540, skipped=0, lr=[7.283527986305063e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:27:20,079] [INFO] [timer.py:259:stop] epoch=0/micro_step=33540/global_step=33540, RunningAvgSamplesPerSec=2.6346164757840698, CurrSamplesPerSec=2.598464621733326, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:27:35,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=33550, skipped=0, lr=[7.28202897379373e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:27:35,537] [INFO] [timer.py:259:stop] epoch=0/micro_step=33550/global_step=33550, RunningAvgSamplesPerSec=2.6346075271098273, CurrSamplesPerSec=2.6395339249190704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:27:50,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=33560, skipped=0, lr=[7.280529702157271e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:27:50,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=33560/global_step=33560, RunningAvgSamplesPerSec=2.634602356428774, CurrSamplesPerSec=2.605238997955005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:28:06,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=33570, skipped=0, lr=[7.279030171565926e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:28:06,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=33570/global_step=33570, RunningAvgSamplesPerSec=2.634599884206531, CurrSamplesPerSec=2.6228152591228313, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:28:21,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=33580, skipped=0, lr=[7.277530382189971e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:28:21,543] [INFO] [timer.py:259:stop] epoch=0/micro_step=33580/global_step=33580, RunningAvgSamplesPerSec=2.6346001034334985, CurrSamplesPerSec=2.6287787310633886, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:28:36,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=33590, skipped=0, lr=[7.276030334199706e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:28:36,905] [INFO] [timer.py:259:stop] epoch=0/micro_step=33590/global_step=33590, RunningAvgSamplesPerSec=2.634595747615731, CurrSamplesPerSec=2.6271140761587453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:28:52,320] [INFO] [logging.py:96:log_dist] [Rank 0] step=33600, skipped=0, lr=[7.274530027765463e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:28:52,322] [INFO] [timer.py:259:stop] epoch=0/micro_step=33600/global_step=33600, RunningAvgSamplesPerSec=2.6345894757146424, CurrSamplesPerSec=2.618918393124258, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:29:07,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=33610, skipped=0, lr=[7.273029463057601e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:29:07,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=33610/global_step=33610, RunningAvgSamplesPerSec=2.6345862484180396, CurrSamplesPerSec=2.622819769459281, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:29:23,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=33620, skipped=0, lr=[7.27152864024651e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:29:23,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=33620/global_step=33620, RunningAvgSamplesPerSec=2.6345787613894798, CurrSamplesPerSec=2.626278839027082, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:29:38,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=33630, skipped=0, lr=[7.2700275595026086e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:29:38,500] [INFO] [timer.py:259:stop] epoch=0/micro_step=33630/global_step=33630, RunningAvgSamplesPerSec=2.63457418471054, CurrSamplesPerSec=2.6405621281061804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:29:53,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=33640, skipped=0, lr=[7.268526220996346e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:29:53,823] [INFO] [timer.py:259:stop] epoch=0/micro_step=33640/global_step=33640, RunningAvgSamplesPerSec=2.6345724574826126, CurrSamplesPerSec=2.626805169809531, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:30:09,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=33650, skipped=0, lr=[7.2670246248981976e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:30:09,149] [INFO] [timer.py:259:stop] epoch=0/micro_step=33650/global_step=33650, RunningAvgSamplesPerSec=2.634569938996655, CurrSamplesPerSec=2.612979989675231, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:30:24,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=33660, skipped=0, lr=[7.265522771378673e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:30:24,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=33660/global_step=33660, RunningAvgSamplesPerSec=2.6345654837789336, CurrSamplesPerSec=2.6307932121210875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:30:39,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=33670, skipped=0, lr=[7.264020660608304e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:30:39,934] [INFO] [timer.py:259:stop] epoch=0/micro_step=33670/global_step=33670, RunningAvgSamplesPerSec=2.6345590799229375, CurrSamplesPerSec=2.6038513121850317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:30:55,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=33680, skipped=0, lr=[7.262518292757661e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:30:55,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=33680/global_step=33680, RunningAvgSamplesPerSec=2.634555031569643, CurrSamplesPerSec=2.609269716049168, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:31:10,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=33690, skipped=0, lr=[7.2610156679973345e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:31:10,763] [INFO] [timer.py:259:stop] epoch=0/micro_step=33690/global_step=33690, RunningAvgSamplesPerSec=2.6345463845819324, CurrSamplesPerSec=2.561059346659045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:31:26,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=33700, skipped=0, lr=[7.259512786497949e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:31:26,158] [INFO] [timer.py:259:stop] epoch=0/micro_step=33700/global_step=33700, RunningAvgSamplesPerSec=2.6345415369795053, CurrSamplesPerSec=2.6039733627419395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:31:41,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=33710, skipped=0, lr=[7.258009648430157e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:31:41,665] [INFO] [timer.py:259:stop] epoch=0/micro_step=33710/global_step=33710, RunningAvgSamplesPerSec=2.6345305643811607, CurrSamplesPerSec=2.6141598937923773, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:31:57,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=33720, skipped=0, lr=[7.256506253964641e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:31:57,050] [INFO] [timer.py:259:stop] epoch=0/micro_step=33720/global_step=33720, RunningAvgSamplesPerSec=2.6345253461878144, CurrSamplesPerSec=2.6205648974418634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:32:12,411] [INFO] [logging.py:96:log_dist] [Rank 0] step=33730, skipped=0, lr=[7.255002603272113e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:32:12,412] [INFO] [timer.py:259:stop] epoch=0/micro_step=33730/global_step=33730, RunningAvgSamplesPerSec=2.634521120338977, CurrSamplesPerSec=2.6234784450867283, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:32:27,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=33740, skipped=0, lr=[7.253498696523311e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:32:27,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=33740/global_step=33740, RunningAvgSamplesPerSec=2.6345144884274125, CurrSamplesPerSec=2.6263269401913925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:32:43,212] [INFO] [logging.py:96:log_dist] [Rank 0] step=33750, skipped=0, lr=[7.251994533889005e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:32:43,216] [INFO] [timer.py:259:stop] epoch=0/micro_step=33750/global_step=33750, RunningAvgSamplesPerSec=2.634509315959991, CurrSamplesPerSec=2.6082649226226793, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:32:58,649] [INFO] [logging.py:96:log_dist] [Rank 0] step=33760, skipped=0, lr=[7.250490115539993e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:32:58,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=33760/global_step=33760, RunningAvgSamplesPerSec=2.634502137807676, CurrSamplesPerSec=2.5986268198752693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:33:14,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=33770, skipped=0, lr=[7.2489854416471005e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:33:14,101] [INFO] [timer.py:259:stop] epoch=0/micro_step=33770/global_step=33770, RunningAvgSamplesPerSec=2.634494392108763, CurrSamplesPerSec=2.624355004657099, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:33:29,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=33780, skipped=0, lr=[7.247480512381186e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:33:29,548] [INFO] [timer.py:259:stop] epoch=0/micro_step=33780/global_step=33780, RunningAvgSamplesPerSec=2.6344877714684443, CurrSamplesPerSec=2.5918398947883077, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:33:44,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=33790, skipped=0, lr=[7.245975327913135e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:33:44,894] [INFO] [timer.py:259:stop] epoch=0/micro_step=33790/global_step=33790, RunningAvgSamplesPerSec=2.6344850776603765, CurrSamplesPerSec=2.62280008810498, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:34:00,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=33800, skipped=0, lr=[7.244469888413861e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:34:00,384] [INFO] [timer.py:259:stop] epoch=0/micro_step=33800/global_step=33800, RunningAvgSamplesPerSec=2.634474468050733, CurrSamplesPerSec=2.5407623804079416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:34:15,820] [INFO] [logging.py:96:log_dist] [Rank 0] step=33810, skipped=0, lr=[7.242964194054306e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:34:15,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=33810/global_step=33810, RunningAvgSamplesPerSec=2.6344675054567874, CurrSamplesPerSec=2.6222536384705677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:34:31,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=33820, skipped=0, lr=[7.241458245005443e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:34:31,231] [INFO] [timer.py:259:stop] epoch=0/micro_step=33820/global_step=33820, RunningAvgSamplesPerSec=2.634461797241584, CurrSamplesPerSec=2.610414996162446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:34:46,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=33830, skipped=0, lr=[7.239952041438273e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:34:46,745] [INFO] [timer.py:259:stop] epoch=0/micro_step=33830/global_step=33830, RunningAvgSamplesPerSec=2.634450496747208, CurrSamplesPerSec=2.5982597899873165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:35:02,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=33840, skipped=0, lr=[7.238445583523825e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:35:02,113] [INFO] [timer.py:259:stop] epoch=0/micro_step=33840/global_step=33840, RunningAvgSamplesPerSec=2.634446629982687, CurrSamplesPerSec=2.6396065998464557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:35:17,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=33850, skipped=0, lr=[7.23693887143316e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:35:17,473] [INFO] [timer.py:259:stop] epoch=0/micro_step=33850/global_step=33850, RunningAvgSamplesPerSec=2.6344426217405332, CurrSamplesPerSec=2.632114377694172, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:35:32,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=33860, skipped=0, lr=[7.235431905337364e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:35:32,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=33860/global_step=33860, RunningAvgSamplesPerSec=2.634434765048079, CurrSamplesPerSec=2.5892023295742974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:35:48,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=33870, skipped=0, lr=[7.233924685407553e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:35:48,301] [INFO] [timer.py:259:stop] epoch=0/micro_step=33870/global_step=33870, RunningAvgSamplesPerSec=2.6344313805053505, CurrSamplesPerSec=2.6183490412870913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:36:03,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=33880, skipped=0, lr=[7.232417211814873e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:36:03,709] [INFO] [timer.py:259:stop] epoch=0/micro_step=33880/global_step=33880, RunningAvgSamplesPerSec=2.6344254017661175, CurrSamplesPerSec=2.643969014453193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:36:18,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=33890, skipped=0, lr=[7.2309094847305e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:36:18,975] [INFO] [timer.py:259:stop] epoch=0/micro_step=33890/global_step=33890, RunningAvgSamplesPerSec=2.6344258203810407, CurrSamplesPerSec=2.610651403814361, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:36:34,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=33900, skipped=0, lr=[7.2294015043256335e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:36:34,245] [INFO] [timer.py:259:stop] epoch=0/micro_step=33900/global_step=33900, RunningAvgSamplesPerSec=2.6344248879883607, CurrSamplesPerSec=2.532661051502591, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:36:49,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=33910, skipped=0, lr=[7.227893270771507e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:36:49,496] [INFO] [timer.py:259:stop] epoch=0/micro_step=33910/global_step=33910, RunningAvgSamplesPerSec=2.634424496446859, CurrSamplesPerSec=2.6103386399528943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:37:04,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=33920, skipped=0, lr=[7.22638478423938e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:37:04,855] [INFO] [timer.py:259:stop] epoch=0/micro_step=33920/global_step=33920, RunningAvgSamplesPerSec=2.634420305432275, CurrSamplesPerSec=2.6327037786440557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:37:20,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=33930, skipped=0, lr=[7.224876044900542e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:37:20,188] [INFO] [timer.py:259:stop] epoch=0/micro_step=33930/global_step=33930, RunningAvgSamplesPerSec=2.634417208295991, CurrSamplesPerSec=2.628276313297322, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:37:35,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=33940, skipped=0, lr=[7.223367052926312e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:37:35,470] [INFO] [timer.py:259:stop] epoch=0/micro_step=33940/global_step=33940, RunningAvgSamplesPerSec=2.634416227792072, CurrSamplesPerSec=2.6403630719297135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:37:50,655] [INFO] [logging.py:96:log_dist] [Rank 0] step=33950, skipped=0, lr=[7.2218578084880355e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:37:50,657] [INFO] [timer.py:259:stop] epoch=0/micro_step=33950/global_step=33950, RunningAvgSamplesPerSec=2.634418983113387, CurrSamplesPerSec=2.6358116831792926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:38:05,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=33960, skipped=0, lr=[7.22034831175709e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:38:05,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=33960/global_step=33960, RunningAvgSamplesPerSec=2.6344207599659373, CurrSamplesPerSec=2.6457990473717103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:38:21,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=33970, skipped=0, lr=[7.218838562904873e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:38:21,166] [INFO] [timer.py:259:stop] epoch=0/micro_step=33970/global_step=33970, RunningAvgSamplesPerSec=2.63441986962919, CurrSamplesPerSec=2.6313646876422365, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:38:36,404] [INFO] [logging.py:96:log_dist] [Rank 0] step=33980, skipped=0, lr=[7.217328562102823e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:38:36,406] [INFO] [timer.py:259:stop] epoch=0/micro_step=33980/global_step=33980, RunningAvgSamplesPerSec=2.634421560636079, CurrSamplesPerSec=2.6365738538748595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:38:51,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=33990, skipped=0, lr=[7.2158183095224e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:38:51,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=33990/global_step=33990, RunningAvgSamplesPerSec=2.6344195250571087, CurrSamplesPerSec=2.634982908720584, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:39:06,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=34000, skipped=0, lr=[7.214307805335092e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:39:06,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=34000/global_step=34000, RunningAvgSamplesPerSec=2.634419162814141, CurrSamplesPerSec=2.6367425022549438, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:39:22,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=34010, skipped=0, lr=[7.212797049712418e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:39:22,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=34010/global_step=34010, RunningAvgSamplesPerSec=2.6344188528088055, CurrSamplesPerSec=2.6391710254611396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:39:37,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=34020, skipped=0, lr=[7.211286042825926e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:39:37,504] [INFO] [timer.py:259:stop] epoch=0/micro_step=34020/global_step=34020, RunningAvgSamplesPerSec=2.6344197403530374, CurrSamplesPerSec=2.605290377177853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:39:52,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=34030, skipped=0, lr=[7.20977478484719e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:39:52,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=34030/global_step=34030, RunningAvgSamplesPerSec=2.6344198724363004, CurrSamplesPerSec=2.6397527923929576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:40:07,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=34040, skipped=0, lr=[7.208263275947813e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:40:07,966] [INFO] [timer.py:259:stop] epoch=0/micro_step=34040/global_step=34040, RunningAvgSamplesPerSec=2.634421992224369, CurrSamplesPerSec=2.6348931078303597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:40:23,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=34050, skipped=0, lr=[7.20675151629943e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:40:23,234] [INFO] [timer.py:259:stop] epoch=0/micro_step=34050/global_step=34050, RunningAvgSamplesPerSec=2.6344213317604144, CurrSamplesPerSec=2.6368821611002007, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:40:38,580] [INFO] [logging.py:96:log_dist] [Rank 0] step=34060, skipped=0, lr=[7.205239506073698e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:40:38,582] [INFO] [timer.py:259:stop] epoch=0/micro_step=34060/global_step=34060, RunningAvgSamplesPerSec=2.6344175832423393, CurrSamplesPerSec=2.598514526697312, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:40:53,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=34070, skipped=0, lr=[7.203727245442309e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:40:53,903] [INFO] [timer.py:259:stop] epoch=0/micro_step=34070/global_step=34070, RunningAvgSamplesPerSec=2.6344156907767835, CurrSamplesPerSec=2.6358945064741204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:41:09,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=34080, skipped=0, lr=[7.20221473457698e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:41:09,191] [INFO] [timer.py:259:stop] epoch=0/micro_step=34080/global_step=34080, RunningAvgSamplesPerSec=2.6344149592463317, CurrSamplesPerSec=2.630977212518913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:41:24,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=34090, skipped=0, lr=[7.20070197364946e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:41:24,495] [INFO] [timer.py:259:stop] epoch=0/micro_step=34090/global_step=34090, RunningAvgSamplesPerSec=2.634413805039894, CurrSamplesPerSec=2.6481665404107595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:41:39,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=34100, skipped=0, lr=[7.199188962831518e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:41:39,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=34100/global_step=34100, RunningAvgSamplesPerSec=2.634413695765344, CurrSamplesPerSec=2.5994908761642943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:41:55,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=34110, skipped=0, lr=[7.19767570229496e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:41:55,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=34110/global_step=34110, RunningAvgSamplesPerSec=2.6344127349219413, CurrSamplesPerSec=2.6219938165900225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:42:10,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=34120, skipped=0, lr=[7.196162192211615e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:42:10,275] [INFO] [timer.py:259:stop] epoch=0/micro_step=34120/global_step=34120, RunningAvgSamplesPerSec=2.6344132147429487, CurrSamplesPerSec=2.6344247526473428, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:42:25,538] [INFO] [logging.py:96:log_dist] [Rank 0] step=34130, skipped=0, lr=[7.194648432753347e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:42:25,539] [INFO] [timer.py:259:stop] epoch=0/micro_step=34130/global_step=34130, RunningAvgSamplesPerSec=2.6344136465020958, CurrSamplesPerSec=2.6498789169385577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:42:40,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=34140, skipped=0, lr=[7.193134424092041e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:42:40,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=34140/global_step=34140, RunningAvgSamplesPerSec=2.6344128452919118, CurrSamplesPerSec=2.6429743794453224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:42:56,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=34150, skipped=0, lr=[7.191620166399614e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:42:56,156] [INFO] [timer.py:259:stop] epoch=0/micro_step=34150/global_step=34150, RunningAvgSamplesPerSec=2.634409336987353, CurrSamplesPerSec=2.646262273261147, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:43:11,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=34160, skipped=0, lr=[7.190105659848008e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:43:11,439] [INFO] [timer.py:259:stop] epoch=0/micro_step=34160/global_step=34160, RunningAvgSamplesPerSec=2.6344085534961073, CurrSamplesPerSec=2.6375371387744555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:43:26,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=34170, skipped=0, lr=[7.188590904609197e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:43:26,700] [INFO] [timer.py:259:stop] epoch=0/micro_step=34170/global_step=34170, RunningAvgSamplesPerSec=2.6344091512604333, CurrSamplesPerSec=2.6305708782703645, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:43:41,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=34180, skipped=0, lr=[7.187075900855185e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:43:41,939] [INFO] [timer.py:259:stop] epoch=0/micro_step=34180/global_step=34180, RunningAvgSamplesPerSec=2.634410048895076, CurrSamplesPerSec=2.6510379295485453, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:43:57,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=34190, skipped=0, lr=[7.185560648757997e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:43:57,219] [INFO] [timer.py:259:stop] epoch=0/micro_step=34190/global_step=34190, RunningAvgSamplesPerSec=2.634409216225715, CurrSamplesPerSec=2.604409928570622, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:44:12,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=34200, skipped=0, lr=[7.184045148489693e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:44:12,433] [INFO] [timer.py:259:stop] epoch=0/micro_step=34200/global_step=34200, RunningAvgSamplesPerSec=2.634412116883359, CurrSamplesPerSec=2.6489404745706513, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:44:27,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=34210, skipped=0, lr=[7.182529400222356e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:44:27,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=34210/global_step=34210, RunningAvgSamplesPerSec=2.6344137870109785, CurrSamplesPerSec=2.629062146451581, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:44:42,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=34220, skipped=0, lr=[7.181013404128101e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:44:42,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=34220/global_step=34220, RunningAvgSamplesPerSec=2.6344148145114756, CurrSamplesPerSec=2.639793496608776, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:44:58,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=34230, skipped=0, lr=[7.179497160379073e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:44:58,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=34230/global_step=34230, RunningAvgSamplesPerSec=2.6344155114279486, CurrSamplesPerSec=2.6466425735546055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:45:13,381] [INFO] [logging.py:96:log_dist] [Rank 0] step=34240, skipped=0, lr=[7.177980669147438e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:45:13,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=34240/global_step=34240, RunningAvgSamplesPerSec=2.63441427844839, CurrSamplesPerSec=2.6352519341938447, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:45:28,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=34250, skipped=0, lr=[7.176463930605394e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:45:28,628] [INFO] [timer.py:259:stop] epoch=0/micro_step=34250/global_step=34250, RunningAvgSamplesPerSec=2.634414979136264, CurrSamplesPerSec=2.6298966816733427, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:45:43,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=34260, skipped=0, lr=[7.174946944925167e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:45:43,849] [INFO] [timer.py:259:stop] epoch=0/micro_step=34260/global_step=34260, RunningAvgSamplesPerSec=2.634416308763965, CurrSamplesPerSec=2.6264926354671636, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:45:59,113] [INFO] [logging.py:96:log_dist] [Rank 0] step=34270, skipped=0, lr=[7.173429712279015e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:45:59,115] [INFO] [timer.py:259:stop] epoch=0/micro_step=34270/global_step=34270, RunningAvgSamplesPerSec=2.634416019882446, CurrSamplesPerSec=2.572942764922812, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:46:14,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=34280, skipped=0, lr=[7.171912232839218e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:46:14,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=34280/global_step=34280, RunningAvgSamplesPerSec=2.6344166008702805, CurrSamplesPerSec=2.6412097877633087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:46:29,581] [INFO] [logging.py:96:log_dist] [Rank 0] step=34290, skipped=0, lr=[7.170394506778085e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:46:29,583] [INFO] [timer.py:259:stop] epoch=0/micro_step=34290/global_step=34290, RunningAvgSamplesPerSec=2.634417498379889, CurrSamplesPerSec=2.6032485017864078, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:46:44,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=34300, skipped=0, lr=[7.168876534267957e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:46:44,805] [INFO] [timer.py:259:stop] epoch=0/micro_step=34300/global_step=34300, RunningAvgSamplesPerSec=2.634419245047503, CurrSamplesPerSec=2.6400684906237832, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:47:00,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=34310, skipped=0, lr=[7.167358315481195e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:47:00,060] [INFO] [timer.py:259:stop] epoch=0/micro_step=34310/global_step=34310, RunningAvgSamplesPerSec=2.634418504108047, CurrSamplesPerSec=2.632923167623496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:47:15,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=34320, skipped=0, lr=[7.165839850590199e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:47:15,314] [INFO] [timer.py:259:stop] epoch=0/micro_step=34320/global_step=34320, RunningAvgSamplesPerSec=2.6344187697760284, CurrSamplesPerSec=2.6288767661014054, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:47:30,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=34330, skipped=0, lr=[7.16432113976739e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:47:30,597] [INFO] [timer.py:259:stop] epoch=0/micro_step=34330/global_step=34330, RunningAvgSamplesPerSec=2.6344180321544592, CurrSamplesPerSec=2.6376831021735136, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:47:46,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=34340, skipped=0, lr=[7.162802183185216e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:47:46,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=34340/global_step=34340, RunningAvgSamplesPerSec=2.6344014874312545, CurrSamplesPerSec=2.5707885586876045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:48:01,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=34350, skipped=0, lr=[7.161282981016157e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:48:01,748] [INFO] [timer.py:259:stop] epoch=0/micro_step=34350/global_step=34350, RunningAvgSamplesPerSec=2.6343885416291575, CurrSamplesPerSec=2.607391377578187, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:48:17,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=34360, skipped=0, lr=[7.159763533432716e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:48:17,069] [INFO] [timer.py:259:stop] epoch=0/micro_step=34360/global_step=34360, RunningAvgSamplesPerSec=2.634387139851641, CurrSamplesPerSec=2.6327475708867425, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:48:32,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=34370, skipped=0, lr=[7.15824384060743e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:48:32,319] [INFO] [timer.py:259:stop] epoch=0/micro_step=34370/global_step=34370, RunningAvgSamplesPerSec=2.634388123214216, CurrSamplesPerSec=2.6322386792669263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:48:47,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=34380, skipped=0, lr=[7.156723902712861e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:48:47,619] [INFO] [timer.py:259:stop] epoch=0/micro_step=34380/global_step=34380, RunningAvgSamplesPerSec=2.6343867105944727, CurrSamplesPerSec=2.63635634175574, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:49:02,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=34390, skipped=0, lr=[7.155203719921595e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:49:02,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=34390/global_step=34390, RunningAvgSamplesPerSec=2.634386547258701, CurrSamplesPerSec=2.652688171643825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:49:18,092] [INFO] [logging.py:96:log_dist] [Rank 0] step=34400, skipped=0, lr=[7.153683292406252e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:49:18,094] [INFO] [timer.py:259:stop] epoch=0/micro_step=34400/global_step=34400, RunningAvgSamplesPerSec=2.63438795244247, CurrSamplesPerSec=2.65062328186036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:49:33,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=34410, skipped=0, lr=[7.152162620339473e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:49:33,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=34410/global_step=34410, RunningAvgSamplesPerSec=2.634386164728934, CurrSamplesPerSec=2.642451954527953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:49:48,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=34420, skipped=0, lr=[7.150641703893938e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:49:48,666] [INFO] [timer.py:259:stop] epoch=0/micro_step=34420/global_step=34420, RunningAvgSamplesPerSec=2.634386001357678, CurrSamplesPerSec=2.629486972017829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:50:03,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=34430, skipped=0, lr=[7.1491205432423424e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:50:03,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=34430/global_step=34430, RunningAvgSamplesPerSec=2.6343889496932973, CurrSamplesPerSec=2.63955967214988, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:50:19,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=34440, skipped=0, lr=[7.1475991385574155e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:50:19,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=34440/global_step=34440, RunningAvgSamplesPerSec=2.63438972682748, CurrSamplesPerSec=2.6003482525939887, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:50:34,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=34450, skipped=0, lr=[7.146077490011913e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:50:34,381] [INFO] [timer.py:259:stop] epoch=0/micro_step=34450/global_step=34450, RunningAvgSamplesPerSec=2.634388257663065, CurrSamplesPerSec=2.6279741315201557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:50:49,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=34460, skipped=0, lr=[7.144555597778617e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:50:49,600] [INFO] [timer.py:259:stop] epoch=0/micro_step=34460/global_step=34460, RunningAvgSamplesPerSec=2.6343897856063614, CurrSamplesPerSec=2.626231972920686, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:51:04,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=34470, skipped=0, lr=[7.143033462030345e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:51:04,911] [INFO] [timer.py:259:stop] epoch=0/micro_step=34470/global_step=34470, RunningAvgSamplesPerSec=2.6343868117793416, CurrSamplesPerSec=2.615654827762629, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:51:20,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=34480, skipped=0, lr=[7.141511082939928e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:51:20,252] [INFO] [timer.py:259:stop] epoch=0/micro_step=34480/global_step=34480, RunningAvgSamplesPerSec=2.634382578491847, CurrSamplesPerSec=2.6186007844300367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:51:35,789] [INFO] [logging.py:96:log_dist] [Rank 0] step=34490, skipped=0, lr=[7.139988460680238e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:51:35,791] [INFO] [timer.py:259:stop] epoch=0/micro_step=34490/global_step=34490, RunningAvgSamplesPerSec=2.6343691031653442, CurrSamplesPerSec=2.6338680734655444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:51:51,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=34500, skipped=0, lr=[7.138465595424168e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:51:51,120] [INFO] [timer.py:259:stop] epoch=0/micro_step=34500/global_step=34500, RunningAvgSamplesPerSec=2.6343653722630913, CurrSamplesPerSec=2.620844497321926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:52:06,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=34510, skipped=0, lr=[7.136942487344639e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:52:06,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=34510/global_step=34510, RunningAvgSamplesPerSec=2.634364544766525, CurrSamplesPerSec=2.634020247673432, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:52:21,682] [INFO] [logging.py:96:log_dist] [Rank 0] step=34520, skipped=0, lr=[7.135419136614601e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:52:21,687] [INFO] [timer.py:259:stop] epoch=0/micro_step=34520/global_step=34520, RunningAvgSamplesPerSec=2.6343625922901595, CurrSamplesPerSec=2.63011024294762, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:52:36,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=34530, skipped=0, lr=[7.133895543407032e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:52:36,980] [INFO] [timer.py:259:stop] epoch=0/micro_step=34530/global_step=34530, RunningAvgSamplesPerSec=2.634360874558908, CurrSamplesPerSec=2.6269067594861104, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:52:52,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=34540, skipped=0, lr=[7.132371707894934e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:52:52,272] [INFO] [timer.py:259:stop] epoch=0/micro_step=34540/global_step=34540, RunningAvgSamplesPerSec=2.63435906941346, CurrSamplesPerSec=2.6248809752120557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:53:07,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=34550, skipped=0, lr=[7.13084763025134e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:53:07,529] [INFO] [timer.py:259:stop] epoch=0/micro_step=34550/global_step=34550, RunningAvgSamplesPerSec=2.634359115603442, CurrSamplesPerSec=2.6369953080446678, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:53:22,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=34560, skipped=0, lr=[7.129323310649311e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:53:22,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=34560/global_step=34560, RunningAvgSamplesPerSec=2.634357270647039, CurrSamplesPerSec=2.6406905539670547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:53:38,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=34570, skipped=0, lr=[7.127798749261934e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:53:38,077] [INFO] [timer.py:259:stop] epoch=0/micro_step=34570/global_step=34570, RunningAvgSamplesPerSec=2.634358415490513, CurrSamplesPerSec=2.635856407112048, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:53:53,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=34580, skipped=0, lr=[7.126273946262321e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:53:53,353] [INFO] [timer.py:259:stop] epoch=0/micro_step=34580/global_step=34580, RunningAvgSamplesPerSec=2.634358494332225, CurrSamplesPerSec=2.6426933682612317, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:54:08,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=34590, skipped=0, lr=[7.124748901823615e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:54:08,639] [INFO] [timer.py:259:stop] epoch=0/micro_step=34590/global_step=34590, RunningAvgSamplesPerSec=2.634356930500635, CurrSamplesPerSec=2.6144332395438297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:54:23,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=34600, skipped=0, lr=[7.123223616118985e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:54:23,894] [INFO] [timer.py:259:stop] epoch=0/micro_step=34600/global_step=34600, RunningAvgSamplesPerSec=2.634356783677745, CurrSamplesPerSec=2.652225628460789, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:54:39,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=34610, skipped=0, lr=[7.1216980893216295e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:54:39,167] [INFO] [timer.py:259:stop] epoch=0/micro_step=34610/global_step=34610, RunningAvgSamplesPerSec=2.634355759782086, CurrSamplesPerSec=2.6143432043022368, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:54:54,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=34620, skipped=0, lr=[7.12017232160477e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:54:54,408] [INFO] [timer.py:259:stop] epoch=0/micro_step=34620/global_step=34620, RunningAvgSamplesPerSec=2.6343567913624653, CurrSamplesPerSec=2.642814507976949, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:55:09,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=34630, skipped=0, lr=[7.118646313141661e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:55:09,681] [INFO] [timer.py:259:stop] epoch=0/micro_step=34630/global_step=34630, RunningAvgSamplesPerSec=2.6343567753661064, CurrSamplesPerSec=2.6118958898777347, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:55:24,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=34640, skipped=0, lr=[7.117120064105578e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:55:24,930] [INFO] [timer.py:259:stop] epoch=0/micro_step=34640/global_step=34640, RunningAvgSamplesPerSec=2.6343578461118917, CurrSamplesPerSec=2.6390473136744452, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:55:40,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=34650, skipped=0, lr=[7.115593574669828e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:55:40,178] [INFO] [timer.py:259:stop] epoch=0/micro_step=34650/global_step=34650, RunningAvgSamplesPerSec=2.6343585192118972, CurrSamplesPerSec=2.6233648142871275, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:55:55,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=34660, skipped=0, lr=[7.114066845007746e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:55:55,440] [INFO] [timer.py:259:stop] epoch=0/micro_step=34660/global_step=34660, RunningAvgSamplesPerSec=2.634358955692124, CurrSamplesPerSec=2.654102381557997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:56:10,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=34670, skipped=0, lr=[7.112539875292692e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:56:10,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=34670/global_step=34670, RunningAvgSamplesPerSec=2.63436007730006, CurrSamplesPerSec=2.6416140085342126, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:56:25,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=34680, skipped=0, lr=[7.1110126656980516e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:56:25,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=34680/global_step=34680, RunningAvgSamplesPerSec=2.6343640341109547, CurrSamplesPerSec=2.646102421136328, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:56:41,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=34690, skipped=0, lr=[7.109485216397244e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:56:41,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=34690/global_step=34690, RunningAvgSamplesPerSec=2.6343626149940413, CurrSamplesPerSec=2.6388024151519036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:56:56,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=34700, skipped=0, lr=[7.107957527563708e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:56:56,339] [INFO] [timer.py:259:stop] epoch=0/micro_step=34700/global_step=34700, RunningAvgSamplesPerSec=2.6343656547853347, CurrSamplesPerSec=2.638049740587987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:57:11,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=34710, skipped=0, lr=[7.106429599370915e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:57:11,669] [INFO] [timer.py:259:stop] epoch=0/micro_step=34710/global_step=34710, RunningAvgSamplesPerSec=2.6343630446980884, CurrSamplesPerSec=2.6270634779999464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:57:26,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=34720, skipped=0, lr=[7.10490143199236e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:57:26,972] [INFO] [timer.py:259:stop] epoch=0/micro_step=34720/global_step=34720, RunningAvgSamplesPerSec=2.634361562993428, CurrSamplesPerSec=2.624877689808084, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:57:42,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=34730, skipped=0, lr=[7.10337302560157e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:57:42,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=34730/global_step=34730, RunningAvgSamplesPerSec=2.6343601996702395, CurrSamplesPerSec=2.6426421682152457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:57:57,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=34740, skipped=0, lr=[7.101844380372093e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:57:57,537] [INFO] [timer.py:259:stop] epoch=0/micro_step=34740/global_step=34740, RunningAvgSamplesPerSec=2.634359466177645, CurrSamplesPerSec=2.626332284873969, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:58:12,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=34750, skipped=0, lr=[7.100315496477507e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:58:12,814] [INFO] [timer.py:259:stop] epoch=0/micro_step=34750/global_step=34750, RunningAvgSamplesPerSec=2.6343592845458894, CurrSamplesPerSec=2.6427853668247088, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:58:28,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=34760, skipped=0, lr=[7.09878637409142e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:58:28,046] [INFO] [timer.py:259:stop] epoch=0/micro_step=34760/global_step=34760, RunningAvgSamplesPerSec=2.6343602966194286, CurrSamplesPerSec=2.653708601384271, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:58:43,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=34770, skipped=0, lr=[7.097257013387464e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:58:43,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=34770/global_step=34770, RunningAvgSamplesPerSec=2.6343618871449266, CurrSamplesPerSec=2.64370153900914, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:58:58,507] [INFO] [logging.py:96:log_dist] [Rank 0] step=34780, skipped=0, lr=[7.095727414539296e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:58:58,517] [INFO] [timer.py:259:stop] epoch=0/micro_step=34780/global_step=34780, RunningAvgSamplesPerSec=2.6343630306386623, CurrSamplesPerSec=2.637492772420107, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:59:13,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=34790, skipped=0, lr=[7.094197577720604e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:59:13,766] [INFO] [timer.py:259:stop] epoch=0/micro_step=34790/global_step=34790, RunningAvgSamplesPerSec=2.634363538472732, CurrSamplesPerSec=2.6337324547517404, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:59:28,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=34800, skipped=0, lr=[7.092667503105099e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:59:28,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=34800/global_step=34800, RunningAvgSamplesPerSec=2.634366693700055, CurrSamplesPerSec=2.6298105248780606, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:59:44,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=34810, skipped=0, lr=[7.091137190866528e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:59:44,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=34810/global_step=34810, RunningAvgSamplesPerSec=2.6343678237677546, CurrSamplesPerSec=2.634229929529498, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 14:59:59,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=34820, skipped=0, lr=[7.089606641178652e-06], mom=[(0.9, 0.95)] +[2024-11-01 14:59:59,461] [INFO] [timer.py:259:stop] epoch=0/micro_step=34820/global_step=34820, RunningAvgSamplesPerSec=2.634366662371377, CurrSamplesPerSec=2.5785573050335557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:00:14,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=34830, skipped=0, lr=[7.0880758542152665e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:00:14,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=34830/global_step=34830, RunningAvgSamplesPerSec=2.634368571555755, CurrSamplesPerSec=2.631247896650077, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:00:29,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=34840, skipped=0, lr=[7.086544830150195e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:00:29,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=34840/global_step=34840, RunningAvgSamplesPerSec=2.6343691834239027, CurrSamplesPerSec=2.6181700715464395, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:00:45,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=34850, skipped=0, lr=[7.085013569157285e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:00:45,126] [INFO] [timer.py:259:stop] epoch=0/micro_step=34850/global_step=34850, RunningAvgSamplesPerSec=2.6343722400016887, CurrSamplesPerSec=2.6658572665268903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:01:00,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=34860, skipped=0, lr=[7.083482071410411e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:01:00,310] [INFO] [timer.py:259:stop] epoch=0/micro_step=34860/global_step=34860, RunningAvgSamplesPerSec=2.6343754652398164, CurrSamplesPerSec=2.655503809416667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:01:15,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=34870, skipped=0, lr=[7.081950337083475e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:01:15,479] [INFO] [timer.py:259:stop] epoch=0/micro_step=34870/global_step=34870, RunningAvgSamplesPerSec=2.634380182963478, CurrSamplesPerSec=2.647807531709192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:01:30,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=34880, skipped=0, lr=[7.080418366350408e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:01:30,772] [INFO] [timer.py:259:stop] epoch=0/micro_step=34880/global_step=34880, RunningAvgSamplesPerSec=2.634379199345807, CurrSamplesPerSec=2.6028825882236633, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:01:45,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=34890, skipped=0, lr=[7.078886159385162e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:01:46,008] [INFO] [timer.py:259:stop] epoch=0/micro_step=34890/global_step=34890, RunningAvgSamplesPerSec=2.6343813321998746, CurrSamplesPerSec=2.6382667024848367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:02:01,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=34900, skipped=0, lr=[7.0773537163617225e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:02:01,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=34900/global_step=34900, RunningAvgSamplesPerSec=2.63438213964958, CurrSamplesPerSec=2.65725178466691, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:02:16,406] [INFO] [logging.py:96:log_dist] [Rank 0] step=34910, skipped=0, lr=[7.075821037454098e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:02:16,410] [INFO] [timer.py:259:stop] epoch=0/micro_step=34910/global_step=34910, RunningAvgSamplesPerSec=2.6343861300683713, CurrSamplesPerSec=2.6445662382509996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:02:31,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=34920, skipped=0, lr=[7.074288122836326e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:02:31,682] [INFO] [timer.py:259:stop] epoch=0/micro_step=34920/global_step=34920, RunningAvgSamplesPerSec=2.6343853417040277, CurrSamplesPerSec=2.631934347063648, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:02:46,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=34930, skipped=0, lr=[7.072754972682467e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:02:46,937] [INFO] [timer.py:259:stop] epoch=0/micro_step=34930/global_step=34930, RunningAvgSamplesPerSec=2.634385704963317, CurrSamplesPerSec=2.656109620212593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:03:02,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=34940, skipped=0, lr=[7.071221587166611e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:03:02,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=34940/global_step=34940, RunningAvgSamplesPerSec=2.6343862924482404, CurrSamplesPerSec=2.637465821625188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:03:17,451] [INFO] [logging.py:96:log_dist] [Rank 0] step=34950, skipped=0, lr=[7.069687966462877e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:03:17,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=34950/global_step=34950, RunningAvgSamplesPerSec=2.6343871671726076, CurrSamplesPerSec=2.6500982476239634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:03:32,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=34960, skipped=0, lr=[7.068154110745407e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:03:32,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=34960/global_step=34960, RunningAvgSamplesPerSec=2.634386875639089, CurrSamplesPerSec=2.637603069104749, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:03:47,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=34970, skipped=0, lr=[7.066620020188369e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:03:47,994] [INFO] [timer.py:259:stop] epoch=0/micro_step=34970/global_step=34970, RunningAvgSamplesPerSec=2.6343875353924155, CurrSamplesPerSec=2.641643123859646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:04:03,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=34980, skipped=0, lr=[7.065085694965962e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:04:03,399] [INFO] [timer.py:259:stop] epoch=0/micro_step=34980/global_step=34980, RunningAvgSamplesPerSec=2.634381854021191, CurrSamplesPerSec=2.6423703833060257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:04:18,671] [INFO] [logging.py:96:log_dist] [Rank 0] step=34990, skipped=0, lr=[7.063551135252406e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:04:18,674] [INFO] [timer.py:259:stop] epoch=0/micro_step=34990/global_step=34990, RunningAvgSamplesPerSec=2.6343810576764746, CurrSamplesPerSec=2.616437618892334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:04:34,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=35000, skipped=0, lr=[7.062016341221953e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:04:34,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=35000/global_step=35000, RunningAvgSamplesPerSec=2.6343756825986415, CurrSamplesPerSec=2.611225540814709, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:04:49,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=35010, skipped=0, lr=[7.0604813130488804e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:04:49,455] [INFO] [timer.py:259:stop] epoch=0/micro_step=35010/global_step=35010, RunningAvgSamplesPerSec=2.6343722952591886, CurrSamplesPerSec=2.545122051950437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:05:04,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=35020, skipped=0, lr=[7.058946050907489e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:05:04,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=35020/global_step=35020, RunningAvgSamplesPerSec=2.6343676813042123, CurrSamplesPerSec=2.6169314383563185, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:05:20,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=35030, skipped=0, lr=[7.0574105549721096e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:05:20,272] [INFO] [timer.py:259:stop] epoch=0/micro_step=35030/global_step=35030, RunningAvgSamplesPerSec=2.6343627646170793, CurrSamplesPerSec=2.5880871613002734, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:05:35,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=35040, skipped=0, lr=[7.055874825417097e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:05:35,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=35040/global_step=35040, RunningAvgSamplesPerSec=2.6343612097649496, CurrSamplesPerSec=2.62571984362826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:05:50,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=35050, skipped=0, lr=[7.054338862416835e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:05:50,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=35050/global_step=35050, RunningAvgSamplesPerSec=2.6343573793626427, CurrSamplesPerSec=2.6215791918034204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:06:06,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=35060, skipped=0, lr=[7.052802666145733e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:06:06,253] [INFO] [timer.py:259:stop] epoch=0/micro_step=35060/global_step=35060, RunningAvgSamplesPerSec=2.634356588359168, CurrSamplesPerSec=2.6383770639099255, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:06:21,581] [INFO] [logging.py:96:log_dist] [Rank 0] step=35070, skipped=0, lr=[7.051266236778224e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:06:21,583] [INFO] [timer.py:259:stop] epoch=0/micro_step=35070/global_step=35070, RunningAvgSamplesPerSec=2.634354163299803, CurrSamplesPerSec=2.643629888112743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:06:36,841] [INFO] [logging.py:96:log_dist] [Rank 0] step=35080, skipped=0, lr=[7.049729574488773e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:06:36,842] [INFO] [timer.py:259:stop] epoch=0/micro_step=35080/global_step=35080, RunningAvgSamplesPerSec=2.6343547858722722, CurrSamplesPerSec=2.654443360271551, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:06:52,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=35090, skipped=0, lr=[7.048192679451867e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:06:52,104] [INFO] [timer.py:259:stop] epoch=0/micro_step=35090/global_step=35090, RunningAvgSamplesPerSec=2.634354810786321, CurrSamplesPerSec=2.640471530968416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:07:07,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=35100, skipped=0, lr=[7.04665555184202e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:07:07,364] [INFO] [timer.py:259:stop] epoch=0/micro_step=35100/global_step=35100, RunningAvgSamplesPerSec=2.634355986984005, CurrSamplesPerSec=2.6511669575113412, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:07:22,691] [INFO] [logging.py:96:log_dist] [Rank 0] step=35110, skipped=0, lr=[7.045118191833777e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:07:22,697] [INFO] [timer.py:259:stop] epoch=0/micro_step=35110/global_step=35110, RunningAvgSamplesPerSec=2.6343530509255735, CurrSamplesPerSec=2.6362076255571547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:07:38,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=35120, skipped=0, lr=[7.043580599601703e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:07:38,022] [INFO] [timer.py:259:stop] epoch=0/micro_step=35120/global_step=35120, RunningAvgSamplesPerSec=2.6343512517772747, CurrSamplesPerSec=2.6437436148920166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:07:53,319] [INFO] [logging.py:96:log_dist] [Rank 0] step=35130, skipped=0, lr=[7.042042775320391e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:07:53,320] [INFO] [timer.py:259:stop] epoch=0/micro_step=35130/global_step=35130, RunningAvgSamplesPerSec=2.634350164639206, CurrSamplesPerSec=2.6410605235128624, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:08:08,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=35140, skipped=0, lr=[7.040504719164463e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:08:08,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=35140/global_step=35140, RunningAvgSamplesPerSec=2.6343501492188777, CurrSamplesPerSec=2.6047111634735605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:08:23,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=35150, skipped=0, lr=[7.038966431308566e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:08:23,891] [INFO] [timer.py:259:stop] epoch=0/micro_step=35150/global_step=35150, RunningAvgSamplesPerSec=2.6343478828086586, CurrSamplesPerSec=2.6193595759880015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:08:39,169] [INFO] [logging.py:96:log_dist] [Rank 0] step=35160, skipped=0, lr=[7.037427911927372e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:08:39,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=35160/global_step=35160, RunningAvgSamplesPerSec=2.634347959403536, CurrSamplesPerSec=2.581626004701515, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:08:54,384] [INFO] [logging.py:96:log_dist] [Rank 0] step=35170, skipped=0, lr=[7.035889161195582e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:08:54,385] [INFO] [timer.py:259:stop] epoch=0/micro_step=35170/global_step=35170, RunningAvgSamplesPerSec=2.6343500459532816, CurrSamplesPerSec=2.6533275267449117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:09:09,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=35180, skipped=0, lr=[7.034350179287921e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:09:09,617] [INFO] [timer.py:259:stop] epoch=0/micro_step=35180/global_step=35180, RunningAvgSamplesPerSec=2.6343512294807057, CurrSamplesPerSec=2.61858566212802, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:09:24,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=35190, skipped=0, lr=[7.0328109663791396e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:09:24,859] [INFO] [timer.py:259:stop] epoch=0/micro_step=35190/global_step=35190, RunningAvgSamplesPerSec=2.6343522561087385, CurrSamplesPerSec=2.6441965361615334, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:09:40,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=35200, skipped=0, lr=[7.031271522644018e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:09:40,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=35200/global_step=35200, RunningAvgSamplesPerSec=2.6343510883949612, CurrSamplesPerSec=2.629050610911454, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:09:55,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=35210, skipped=0, lr=[7.029731848257359e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:09:55,430] [INFO] [timer.py:259:stop] epoch=0/micro_step=35210/global_step=35210, RunningAvgSamplesPerSec=2.634350450971993, CurrSamplesPerSec=2.6318142027084224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:10:10,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=35220, skipped=0, lr=[7.028191943393995e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:10:10,752] [INFO] [timer.py:259:stop] epoch=0/micro_step=35220/global_step=35220, RunningAvgSamplesPerSec=2.634347138087458, CurrSamplesPerSec=2.640699698032224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:10:26,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=35230, skipped=0, lr=[7.026651808228779e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:10:26,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=35230/global_step=35230, RunningAvgSamplesPerSec=2.6343458955873547, CurrSamplesPerSec=2.650404282964163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:10:41,267] [INFO] [logging.py:96:log_dist] [Rank 0] step=35240, skipped=0, lr=[7.025111442936596e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:10:41,269] [INFO] [timer.py:259:stop] epoch=0/micro_step=35240/global_step=35240, RunningAvgSamplesPerSec=2.6343477210304336, CurrSamplesPerSec=2.604354541280204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:10:56,511] [INFO] [logging.py:96:log_dist] [Rank 0] step=35250, skipped=0, lr=[7.023570847692358e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:10:56,532] [INFO] [timer.py:259:stop] epoch=0/micro_step=35250/global_step=35250, RunningAvgSamplesPerSec=2.6343487169082596, CurrSamplesPerSec=2.6510362539433285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:11:11,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=35260, skipped=0, lr=[7.0220300226709955e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:11:11,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=35260/global_step=35260, RunningAvgSamplesPerSec=2.6343499011997937, CurrSamplesPerSec=2.6322998019538137, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:11:26,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=35270, skipped=0, lr=[7.020488968047472e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:11:26,958] [INFO] [timer.py:259:stop] epoch=0/micro_step=35270/global_step=35270, RunningAvgSamplesPerSec=2.6343535878377593, CurrSamplesPerSec=2.6302306441271166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:11:42,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=35280, skipped=0, lr=[7.018947683996773e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:11:42,205] [INFO] [timer.py:259:stop] epoch=0/micro_step=35280/global_step=35280, RunningAvgSamplesPerSec=2.6343534609034065, CurrSamplesPerSec=2.663982893434982, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:11:57,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=35290, skipped=0, lr=[7.017406170693913e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:11:57,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=35290/global_step=35290, RunningAvgSamplesPerSec=2.634354987474716, CurrSamplesPerSec=2.63630911529522, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:12:12,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=35300, skipped=0, lr=[7.015864428313932e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:12:12,684] [INFO] [timer.py:259:stop] epoch=0/micro_step=35300/global_step=35300, RunningAvgSamplesPerSec=2.634355775233628, CurrSamplesPerSec=2.653573450055123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:12:27,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=35310, skipped=0, lr=[7.0143224570318946e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:12:27,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=35310/global_step=35310, RunningAvgSamplesPerSec=2.6343574026329852, CurrSamplesPerSec=2.6308835588760724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:12:43,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=35320, skipped=0, lr=[7.0127802570228935e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:12:43,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=35320/global_step=35320, RunningAvgSamplesPerSec=2.6343541222534563, CurrSamplesPerSec=2.637161523178802, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:12:58,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=35330, skipped=0, lr=[7.011237828462043e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:12:58,547] [INFO] [timer.py:259:stop] epoch=0/micro_step=35330/global_step=35330, RunningAvgSamplesPerSec=2.6343521812585684, CurrSamplesPerSec=2.618194586503061, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:13:13,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=35340, skipped=0, lr=[7.0096951715244866e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:13:13,836] [INFO] [timer.py:259:stop] epoch=0/micro_step=35340/global_step=35340, RunningAvgSamplesPerSec=2.634351722867834, CurrSamplesPerSec=2.628656815334956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:13:29,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=35350, skipped=0, lr=[7.008152286385397e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:13:29,148] [INFO] [timer.py:259:stop] epoch=0/micro_step=35350/global_step=35350, RunningAvgSamplesPerSec=2.6343499965021837, CurrSamplesPerSec=2.5959918993448894, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:13:44,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=35360, skipped=0, lr=[7.006609173219966e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:13:44,396] [INFO] [timer.py:259:stop] epoch=0/micro_step=35360/global_step=35360, RunningAvgSamplesPerSec=2.634350857754282, CurrSamplesPerSec=2.636274317933295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:13:59,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=35370, skipped=0, lr=[7.005065832203417e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:13:59,633] [INFO] [timer.py:259:stop] epoch=0/micro_step=35370/global_step=35370, RunningAvgSamplesPerSec=2.634352093020134, CurrSamplesPerSec=2.611375516267973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:14:14,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=35380, skipped=0, lr=[7.003522263510995e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:14:14,889] [INFO] [timer.py:259:stop] epoch=0/micro_step=35380/global_step=35380, RunningAvgSamplesPerSec=2.6343530483799884, CurrSamplesPerSec=2.642165228975273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:14:30,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=35390, skipped=0, lr=[7.001978467317974e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:14:30,209] [INFO] [timer.py:259:stop] epoch=0/micro_step=35390/global_step=35390, RunningAvgSamplesPerSec=2.6343501709298547, CurrSamplesPerSec=2.6252765158047806, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:14:45,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=35400, skipped=0, lr=[7.00043444379965e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:14:45,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=35400/global_step=35400, RunningAvgSamplesPerSec=2.634349910563805, CurrSamplesPerSec=2.6373895331304604, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:15:00,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=35410, skipped=0, lr=[6.998890193131352e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:15:00,785] [INFO] [timer.py:259:stop] epoch=0/micro_step=35410/global_step=35410, RunningAvgSamplesPerSec=2.6343491414906435, CurrSamplesPerSec=2.6441811167869274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:15:16,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=35420, skipped=0, lr=[6.997345715488426e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:15:16,045] [INFO] [timer.py:259:stop] epoch=0/micro_step=35420/global_step=35420, RunningAvgSamplesPerSec=2.6343496814051752, CurrSamplesPerSec=2.6328777168096544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:15:31,294] [INFO] [logging.py:96:log_dist] [Rank 0] step=35430, skipped=0, lr=[6.995801011046249e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:15:31,296] [INFO] [timer.py:259:stop] epoch=0/micro_step=35430/global_step=35430, RunningAvgSamplesPerSec=2.6343504871729224, CurrSamplesPerSec=2.632609588945571, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:15:46,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=35440, skipped=0, lr=[6.994256079980224e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:15:46,514] [INFO] [timer.py:259:stop] epoch=0/micro_step=35440/global_step=35440, RunningAvgSamplesPerSec=2.634352827191142, CurrSamplesPerSec=2.6660131597027332, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:16:01,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=35450, skipped=0, lr=[6.99271092246578e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:16:01,778] [INFO] [timer.py:259:stop] epoch=0/micro_step=35450/global_step=35450, RunningAvgSamplesPerSec=2.6343535693992366, CurrSamplesPerSec=2.6277909625027656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:16:17,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=35460, skipped=0, lr=[6.9911655386783635e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:16:17,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=35460/global_step=35460, RunningAvgSamplesPerSec=2.634354993579805, CurrSamplesPerSec=2.6377992209599066, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:16:32,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=35470, skipped=0, lr=[6.989619928793459e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:16:32,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=35470/global_step=35470, RunningAvgSamplesPerSec=2.6343538553350534, CurrSamplesPerSec=2.6361612328508466, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:16:47,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=35480, skipped=0, lr=[6.9880740929865694e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:16:47,681] [INFO] [timer.py:259:stop] epoch=0/micro_step=35480/global_step=35480, RunningAvgSamplesPerSec=2.6343501073918123, CurrSamplesPerSec=2.62678666241502, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:17:03,009] [INFO] [logging.py:96:log_dist] [Rank 0] step=35490, skipped=0, lr=[6.9865280314332265e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:17:03,024] [INFO] [timer.py:259:stop] epoch=0/micro_step=35490/global_step=35490, RunningAvgSamplesPerSec=2.634346460716209, CurrSamplesPerSec=2.6187254478984086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:17:18,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=35500, skipped=0, lr=[6.984981744308983e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:17:18,386] [INFO] [timer.py:259:stop] epoch=0/micro_step=35500/global_step=35500, RunningAvgSamplesPerSec=2.634342468078749, CurrSamplesPerSec=2.5711502321109085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:17:33,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=35510, skipped=0, lr=[6.9834352317894215e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:17:33,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=35510/global_step=35510, RunningAvgSamplesPerSec=2.6343413200114365, CurrSamplesPerSec=2.642747900573265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:17:48,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=35520, skipped=0, lr=[6.981888494050151e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:17:48,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=35520/global_step=35520, RunningAvgSamplesPerSec=2.6343420919520875, CurrSamplesPerSec=2.627002598245987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:18:04,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=35530, skipped=0, lr=[6.980341531266801e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:18:04,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=35530/global_step=35530, RunningAvgSamplesPerSec=2.6343423729515143, CurrSamplesPerSec=2.647028830056007, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:18:19,457] [INFO] [logging.py:96:log_dist] [Rank 0] step=35540, skipped=0, lr=[6.978794343615032e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:18:19,462] [INFO] [timer.py:259:stop] epoch=0/micro_step=35540/global_step=35540, RunningAvgSamplesPerSec=2.634342172095209, CurrSamplesPerSec=2.6208874864893446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:18:34,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=35550, skipped=0, lr=[6.9772469312705245e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:18:34,749] [INFO] [timer.py:259:stop] epoch=0/micro_step=35550/global_step=35550, RunningAvgSamplesPerSec=2.6343413347384383, CurrSamplesPerSec=2.6355251547342897, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:18:50,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=35560, skipped=0, lr=[6.9756992944089926e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:18:50,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=35560/global_step=35560, RunningAvgSamplesPerSec=2.6343367551478454, CurrSamplesPerSec=2.5572575526481045, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:19:05,397] [INFO] [logging.py:96:log_dist] [Rank 0] step=35570, skipped=0, lr=[6.9741514332061665e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:19:05,422] [INFO] [timer.py:259:stop] epoch=0/micro_step=35570/global_step=35570, RunningAvgSamplesPerSec=2.634335284466099, CurrSamplesPerSec=2.6292714516883713, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:19:20,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=35580, skipped=0, lr=[6.972603347837808e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:19:20,769] [INFO] [timer.py:259:stop] epoch=0/micro_step=35580/global_step=35580, RunningAvgSamplesPerSec=2.6343312719163334, CurrSamplesPerSec=2.6289088968224075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:19:36,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=35590, skipped=0, lr=[6.971055038479705e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:19:36,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=35590/global_step=35590, RunningAvgSamplesPerSec=2.634329594685236, CurrSamplesPerSec=2.635383569745807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:19:51,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=35600, skipped=0, lr=[6.9695065053076645e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:19:51,394] [INFO] [timer.py:259:stop] epoch=0/micro_step=35600/global_step=35600, RunningAvgSamplesPerSec=2.6343272387171925, CurrSamplesPerSec=2.6359367483586995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:20:06,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=35610, skipped=0, lr=[6.967957748497526e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:20:06,650] [INFO] [timer.py:259:stop] epoch=0/micro_step=35610/global_step=35610, RunningAvgSamplesPerSec=2.634327526227238, CurrSamplesPerSec=2.6487330442831114, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:20:21,943] [INFO] [logging.py:96:log_dist] [Rank 0] step=35620, skipped=0, lr=[6.966408768225149e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:20:21,959] [INFO] [timer.py:259:stop] epoch=0/micro_step=35620/global_step=35620, RunningAvgSamplesPerSec=2.634325597764639, CurrSamplesPerSec=2.626131257562485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:20:37,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=35630, skipped=0, lr=[6.96485956466642e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:20:37,241] [INFO] [timer.py:259:stop] epoch=0/micro_step=35630/global_step=35630, RunningAvgSamplesPerSec=2.6343243201797493, CurrSamplesPerSec=2.626328584706791, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:20:52,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=35640, skipped=0, lr=[6.963310137997256e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:20:52,535] [INFO] [timer.py:259:stop] epoch=0/micro_step=35640/global_step=35640, RunningAvgSamplesPerSec=2.6343224039291093, CurrSamplesPerSec=2.63640315561891, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:21:07,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=35650, skipped=0, lr=[6.9617604883935915e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:21:07,798] [INFO] [timer.py:259:stop] epoch=0/micro_step=35650/global_step=35650, RunningAvgSamplesPerSec=2.6343220347716008, CurrSamplesPerSec=2.6279893624534827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:21:23,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=35660, skipped=0, lr=[6.960210616031391e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:21:23,133] [INFO] [timer.py:259:stop] epoch=0/micro_step=35660/global_step=35660, RunningAvgSamplesPerSec=2.6343191891963444, CurrSamplesPerSec=2.623900236794796, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:21:38,457] [INFO] [logging.py:96:log_dist] [Rank 0] step=35670, skipped=0, lr=[6.958660521086642e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:21:38,469] [INFO] [timer.py:259:stop] epoch=0/micro_step=35670/global_step=35670, RunningAvgSamplesPerSec=2.6343158629130716, CurrSamplesPerSec=2.623561725894483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:21:53,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=35680, skipped=0, lr=[6.957110203735358e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:21:53,767] [INFO] [timer.py:259:stop] epoch=0/micro_step=35680/global_step=35680, RunningAvgSamplesPerSec=2.6343133295732555, CurrSamplesPerSec=2.6318691126850173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:22:09,037] [INFO] [logging.py:96:log_dist] [Rank 0] step=35690, skipped=0, lr=[6.955559664153579e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:22:09,040] [INFO] [timer.py:259:stop] epoch=0/micro_step=35690/global_step=35690, RunningAvgSamplesPerSec=2.6343121870023696, CurrSamplesPerSec=2.604221944784011, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:22:24,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=35700, skipped=0, lr=[6.95400890251737e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:22:24,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=35700/global_step=35700, RunningAvgSamplesPerSec=2.634313190438792, CurrSamplesPerSec=2.648232585207816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:22:39,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=35710, skipped=0, lr=[6.95245791900282e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:22:39,575] [INFO] [timer.py:259:stop] epoch=0/micro_step=35710/global_step=35710, RunningAvgSamplesPerSec=2.6343113034111294, CurrSamplesPerSec=2.5873383971029646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:22:54,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=35720, skipped=0, lr=[6.9509067137860406e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:22:54,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=35720/global_step=35720, RunningAvgSamplesPerSec=2.6343105915765466, CurrSamplesPerSec=2.612513289338588, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:23:10,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=35730, skipped=0, lr=[6.949355287043177e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:23:10,144] [INFO] [timer.py:259:stop] epoch=0/micro_step=35730/global_step=35730, RunningAvgSamplesPerSec=2.6343093025870665, CurrSamplesPerSec=2.615499059682417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:23:25,411] [INFO] [logging.py:96:log_dist] [Rank 0] step=35740, skipped=0, lr=[6.94780363895039e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:23:25,412] [INFO] [timer.py:259:stop] epoch=0/micro_step=35740/global_step=35740, RunningAvgSamplesPerSec=2.634308946281061, CurrSamplesPerSec=2.6313258937246036, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:23:40,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=35750, skipped=0, lr=[6.946251769683871e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:23:40,687] [INFO] [timer.py:259:stop] epoch=0/micro_step=35750/global_step=35750, RunningAvgSamplesPerSec=2.63430836599116, CurrSamplesPerSec=2.6538744114146278, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:23:55,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=35760, skipped=0, lr=[6.944699679419837e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:23:55,951] [INFO] [timer.py:259:stop] epoch=0/micro_step=35760/global_step=35760, RunningAvgSamplesPerSec=2.6343085052593964, CurrSamplesPerSec=2.643168416114125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:24:11,259] [INFO] [logging.py:96:log_dist] [Rank 0] step=35770, skipped=0, lr=[6.943147368334527e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:24:11,260] [INFO] [timer.py:259:stop] epoch=0/micro_step=35770/global_step=35770, RunningAvgSamplesPerSec=2.6343067947322303, CurrSamplesPerSec=2.620114715557339, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:24:26,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=35780, skipped=0, lr=[6.941594836604203e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:24:26,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=35780/global_step=35780, RunningAvgSamplesPerSec=2.6343073659930094, CurrSamplesPerSec=2.6337171571758446, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:24:41,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=35790, skipped=0, lr=[6.9400420844051615e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:24:41,841] [INFO] [timer.py:259:stop] epoch=0/micro_step=35790/global_step=35790, RunningAvgSamplesPerSec=2.6343041778993666, CurrSamplesPerSec=2.617177193189094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:24:57,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=35800, skipped=0, lr=[6.938489111913715e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:24:57,004] [INFO] [timer.py:259:stop] epoch=0/micro_step=35800/global_step=35800, RunningAvgSamplesPerSec=2.634309179110885, CurrSamplesPerSec=2.6593354494167647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:25:12,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=35810, skipped=0, lr=[6.936935919306203e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:25:12,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=35810/global_step=35810, RunningAvgSamplesPerSec=2.6343043505420223, CurrSamplesPerSec=2.63156445315728, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:25:27,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=35820, skipped=0, lr=[6.9353825067589944e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:25:27,615] [INFO] [timer.py:259:stop] epoch=0/micro_step=35820/global_step=35820, RunningAvgSamplesPerSec=2.6343049169382278, CurrSamplesPerSec=2.649685149233079, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:25:42,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=35830, skipped=0, lr=[6.933828874448475e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:25:42,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=35830/global_step=35830, RunningAvgSamplesPerSec=2.6343048882836126, CurrSamplesPerSec=2.6302557978202845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:25:58,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=35840, skipped=0, lr=[6.9322750225510635e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:25:58,148] [INFO] [timer.py:259:stop] epoch=0/micro_step=35840/global_step=35840, RunningAvgSamplesPerSec=2.6343044480924194, CurrSamplesPerSec=2.650060992333379, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:26:13,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=35850, skipped=0, lr=[6.930720951243201e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:26:13,459] [INFO] [timer.py:259:stop] epoch=0/micro_step=35850/global_step=35850, RunningAvgSamplesPerSec=2.634302209282886, CurrSamplesPerSec=2.635331410689938, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:26:28,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=35860, skipped=0, lr=[6.929166660701351e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:26:28,766] [INFO] [timer.py:259:stop] epoch=0/micro_step=35860/global_step=35860, RunningAvgSamplesPerSec=2.634300271963646, CurrSamplesPerSec=2.640424572513079, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:26:44,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=35870, skipped=0, lr=[6.927612151102003e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:26:44,051] [INFO] [timer.py:259:stop] epoch=0/micro_step=35870/global_step=35870, RunningAvgSamplesPerSec=2.6342988262276754, CurrSamplesPerSec=2.6357905640719435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:26:59,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=35880, skipped=0, lr=[6.926057422621675e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:26:59,349] [INFO] [timer.py:259:stop] epoch=0/micro_step=35880/global_step=35880, RunningAvgSamplesPerSec=2.634296889610483, CurrSamplesPerSec=2.592964712080845, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:27:14,643] [INFO] [logging.py:96:log_dist] [Rank 0] step=35890, skipped=0, lr=[6.924502475436904e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:27:14,662] [INFO] [timer.py:259:stop] epoch=0/micro_step=35890/global_step=35890, RunningAvgSamplesPerSec=2.6342946168509287, CurrSamplesPerSec=2.6477398366703957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:27:29,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=35900, skipped=0, lr=[6.922947309724255e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:27:29,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=35900/global_step=35900, RunningAvgSamplesPerSec=2.634294695525075, CurrSamplesPerSec=2.592701846861431, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:27:45,136] [INFO] [logging.py:96:log_dist] [Rank 0] step=35910, skipped=0, lr=[6.92139192566032e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:27:45,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=35910/global_step=35910, RunningAvgSamplesPerSec=2.634297012175466, CurrSamplesPerSec=2.6416580977054958, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:28:00,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=35920, skipped=0, lr=[6.9198363234217106e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:28:00,448] [INFO] [timer.py:259:stop] epoch=0/micro_step=35920/global_step=35920, RunningAvgSamplesPerSec=2.6342953589796676, CurrSamplesPerSec=2.6171918909540497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:28:15,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=35930, skipped=0, lr=[6.918280503185069e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:28:15,748] [INFO] [timer.py:259:stop] epoch=0/micro_step=35930/global_step=35930, RunningAvgSamplesPerSec=2.6342937782228337, CurrSamplesPerSec=2.6144621662091447, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:28:31,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=35940, skipped=0, lr=[6.916724465127055e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:28:31,019] [INFO] [timer.py:259:stop] epoch=0/micro_step=35940/global_step=35940, RunningAvgSamplesPerSec=2.634293995628034, CurrSamplesPerSec=2.647821321863701, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:28:46,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=35950, skipped=0, lr=[6.91516820942436e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:28:46,275] [INFO] [timer.py:259:stop] epoch=0/micro_step=35950/global_step=35950, RunningAvgSamplesPerSec=2.6342948490356766, CurrSamplesPerSec=2.62877831916689, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:29:01,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=35960, skipped=0, lr=[6.913611736253697e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:29:01,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=35960/global_step=35960, RunningAvgSamplesPerSec=2.6342931188316663, CurrSamplesPerSec=2.6344044830818465, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:29:16,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=35970, skipped=0, lr=[6.912055045791804e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:29:16,859] [INFO] [timer.py:259:stop] epoch=0/micro_step=35970/global_step=35970, RunningAvgSamplesPerSec=2.634292305990818, CurrSamplesPerSec=2.6477795339585706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:29:32,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=35980, skipped=0, lr=[6.910498138215442e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:29:32,129] [INFO] [timer.py:259:stop] epoch=0/micro_step=35980/global_step=35980, RunningAvgSamplesPerSec=2.6342915934621014, CurrSamplesPerSec=2.6279922440012693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:29:47,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=35990, skipped=0, lr=[6.908941013701402e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:29:47,389] [INFO] [timer.py:259:stop] epoch=0/micro_step=35990/global_step=35990, RunningAvgSamplesPerSec=2.634291718879623, CurrSamplesPerSec=2.6229038285679596, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:30:02,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=36000, skipped=0, lr=[6.907383672426494e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:30:02,678] [INFO] [timer.py:259:stop] epoch=0/micro_step=36000/global_step=36000, RunningAvgSamplesPerSec=2.634290290585629, CurrSamplesPerSec=2.6075028182725477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:30:17,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=36010, skipped=0, lr=[6.905826114567553e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:30:17,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=36010/global_step=36010, RunningAvgSamplesPerSec=2.6342893042690805, CurrSamplesPerSec=2.6138756096791633, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:30:33,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=36020, skipped=0, lr=[6.904268340301441e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:30:33,264] [INFO] [timer.py:259:stop] epoch=0/micro_step=36020/global_step=36020, RunningAvgSamplesPerSec=2.634287918874872, CurrSamplesPerSec=2.6234755734290394, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:30:48,507] [INFO] [logging.py:96:log_dist] [Rank 0] step=36030, skipped=0, lr=[6.9027103498050465e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:30:48,520] [INFO] [timer.py:259:stop] epoch=0/micro_step=36030/global_step=36030, RunningAvgSamplesPerSec=2.634288340248419, CurrSamplesPerSec=2.6132657071937047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:31:03,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=36040, skipped=0, lr=[6.901152143255276e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:31:03,876] [INFO] [timer.py:259:stop] epoch=0/micro_step=36040/global_step=36040, RunningAvgSamplesPerSec=2.634283997626869, CurrSamplesPerSec=2.6409158489213875, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:31:19,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=36050, skipped=0, lr=[6.8995937208290674e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:31:19,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=36050/global_step=36050, RunningAvgSamplesPerSec=2.634281857747728, CurrSamplesPerSec=2.600531646609122, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:31:34,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=36060, skipped=0, lr=[6.8980350827033785e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:31:34,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=36060/global_step=36060, RunningAvgSamplesPerSec=2.634281026836827, CurrSamplesPerSec=2.613584872990318, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:31:49,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=36070, skipped=0, lr=[6.8964762290551935e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:31:49,755] [INFO] [timer.py:259:stop] epoch=0/micro_step=36070/global_step=36070, RunningAvgSamplesPerSec=2.6342791908521503, CurrSamplesPerSec=2.61244169183396, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:32:05,048] [INFO] [logging.py:96:log_dist] [Rank 0] step=36080, skipped=0, lr=[6.89491716006152e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:32:05,050] [INFO] [timer.py:259:stop] epoch=0/micro_step=36080/global_step=36080, RunningAvgSamplesPerSec=2.6342777290620956, CurrSamplesPerSec=2.6195092602872028, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:32:20,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=36090, skipped=0, lr=[6.893357875899393e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:32:20,407] [INFO] [timer.py:259:stop] epoch=0/micro_step=36090/global_step=36090, RunningAvgSamplesPerSec=2.6342736903696564, CurrSamplesPerSec=2.604766970340064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:32:35,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=36100, skipped=0, lr=[6.8917983767458666e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:32:35,730] [INFO] [timer.py:259:stop] epoch=0/micro_step=36100/global_step=36100, RunningAvgSamplesPerSec=2.6342710527253934, CurrSamplesPerSec=2.6231699829383692, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:32:51,045] [INFO] [logging.py:96:log_dist] [Rank 0] step=36110, skipped=0, lr=[6.890238662778024e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:32:51,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=36110/global_step=36110, RunningAvgSamplesPerSec=2.6342672100226325, CurrSamplesPerSec=2.6375628470646646, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:33:06,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=36120, skipped=0, lr=[6.888678734172971e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:33:06,409] [INFO] [timer.py:259:stop] epoch=0/micro_step=36120/global_step=36120, RunningAvgSamplesPerSec=2.634264440835087, CurrSamplesPerSec=2.623830886235577, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:33:21,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=36130, skipped=0, lr=[6.88711859110784e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:33:21,780] [INFO] [timer.py:259:stop] epoch=0/micro_step=36130/global_step=36130, RunningAvgSamplesPerSec=2.6342600626036923, CurrSamplesPerSec=2.649251262588777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:33:37,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=36140, skipped=0, lr=[6.885558233759783e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:33:37,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=36140/global_step=36140, RunningAvgSamplesPerSec=2.6342604480457488, CurrSamplesPerSec=2.6305754153202474, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:33:52,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=36150, skipped=0, lr=[6.88399766230598e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:33:52,369] [INFO] [timer.py:259:stop] epoch=0/micro_step=36150/global_step=36150, RunningAvgSamplesPerSec=2.634256933686934, CurrSamplesPerSec=2.6261645544383634, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:34:07,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=36160, skipped=0, lr=[6.882436876923635e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:34:07,680] [INFO] [timer.py:259:stop] epoch=0/micro_step=36160/global_step=36160, RunningAvgSamplesPerSec=2.6342547150754214, CurrSamplesPerSec=2.630560566851562, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:34:23,027] [INFO] [logging.py:96:log_dist] [Rank 0] step=36170, skipped=0, lr=[6.8808758777899735e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:34:23,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=36170/global_step=36170, RunningAvgSamplesPerSec=2.6342502669979595, CurrSamplesPerSec=2.621388311832262, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:34:38,401] [INFO] [logging.py:96:log_dist] [Rank 0] step=36180, skipped=0, lr=[6.87931466508225e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:34:38,402] [INFO] [timer.py:259:stop] epoch=0/micro_step=36180/global_step=36180, RunningAvgSamplesPerSec=2.6342451916481937, CurrSamplesPerSec=2.6226889759999827, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:34:53,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=36190, skipped=0, lr=[6.87775323897774e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:34:53,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=36190/global_step=36190, RunningAvgSamplesPerSec=2.6342406493249326, CurrSamplesPerSec=2.6269396647097754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:35:09,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=36200, skipped=0, lr=[6.8761915996537435e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:35:09,173] [INFO] [timer.py:259:stop] epoch=0/micro_step=36200/global_step=36200, RunningAvgSamplesPerSec=2.6342342646294608, CurrSamplesPerSec=2.5752634364744553, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:35:24,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=36210, skipped=0, lr=[6.874629747287587e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:35:24,531] [INFO] [timer.py:259:stop] epoch=0/micro_step=36210/global_step=36210, RunningAvgSamplesPerSec=2.634229356666648, CurrSamplesPerSec=2.6114612823317507, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:35:39,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=36220, skipped=0, lr=[6.8730676820566154e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:35:39,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=36220/global_step=36220, RunningAvgSamplesPerSec=2.6342228404549854, CurrSamplesPerSec=2.6040537930383443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:35:55,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=36230, skipped=0, lr=[6.871505404138204e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:35:55,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=36230/global_step=36230, RunningAvgSamplesPerSec=2.6342201147576962, CurrSamplesPerSec=2.6248477108767605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:36:10,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=36240, skipped=0, lr=[6.869942913709752e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:36:10,588] [INFO] [timer.py:259:stop] epoch=0/micro_step=36240/global_step=36240, RunningAvgSamplesPerSec=2.63421612044169, CurrSamplesPerSec=2.648217118725769, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:36:25,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=36250, skipped=0, lr=[6.868380210948677e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:36:25,946] [INFO] [timer.py:259:stop] epoch=0/micro_step=36250/global_step=36250, RunningAvgSamplesPerSec=2.6342121991312504, CurrSamplesPerSec=2.6058637773232416, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:36:41,290] [INFO] [logging.py:96:log_dist] [Rank 0] step=36260, skipped=0, lr=[6.866817296032426e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:36:41,291] [INFO] [timer.py:259:stop] epoch=0/micro_step=36260/global_step=36260, RunningAvgSamplesPerSec=2.634208675400968, CurrSamplesPerSec=2.638084999653208, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:36:56,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=36270, skipped=0, lr=[6.86525416913847e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:36:56,592] [INFO] [timer.py:259:stop] epoch=0/micro_step=36270/global_step=36270, RunningAvgSamplesPerSec=2.634206648124484, CurrSamplesPerSec=2.627425112530684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:37:11,912] [INFO] [logging.py:96:log_dist] [Rank 0] step=36280, skipped=0, lr=[6.863690830444301e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:37:11,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=36280/global_step=36280, RunningAvgSamplesPerSec=2.6342036351490408, CurrSamplesPerSec=2.626208129403269, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:37:27,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=36290, skipped=0, lr=[6.862127280127437e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:37:27,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=36290/global_step=36290, RunningAvgSamplesPerSec=2.6342000988204934, CurrSamplesPerSec=2.6223462688147263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:37:42,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=36300, skipped=0, lr=[6.860563518365418e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:37:42,595] [INFO] [timer.py:259:stop] epoch=0/micro_step=36300/global_step=36300, RunningAvgSamplesPerSec=2.6341963807712405, CurrSamplesPerSec=2.6285748580768598, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:37:57,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=36310, skipped=0, lr=[6.858999545335811e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:37:57,908] [INFO] [timer.py:259:stop] epoch=0/micro_step=36310/global_step=36310, RunningAvgSamplesPerSec=2.6341944607436893, CurrSamplesPerSec=2.6368510784749444, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:38:13,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=36320, skipped=0, lr=[6.857435361216206e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:38:13,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=36320/global_step=36320, RunningAvgSamplesPerSec=2.634191664945894, CurrSamplesPerSec=2.6137904993869583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:38:28,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=36330, skipped=0, lr=[6.855870966184217e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:38:28,505] [INFO] [timer.py:259:stop] epoch=0/micro_step=36330/global_step=36330, RunningAvgSamplesPerSec=2.6341909228040903, CurrSamplesPerSec=2.610125028756943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:38:43,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=36340, skipped=0, lr=[6.854306360417482e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:38:43,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=36340/global_step=36340, RunningAvgSamplesPerSec=2.6341877546879466, CurrSamplesPerSec=2.6357193413424307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:38:59,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=36350, skipped=0, lr=[6.852741544093659e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:38:59,188] [INFO] [timer.py:259:stop] epoch=0/micro_step=36350/global_step=36350, RunningAvgSamplesPerSec=2.6341845001492206, CurrSamplesPerSec=2.5670002639677754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:39:14,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=36360, skipped=0, lr=[6.8511765173904364e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:39:14,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=36360/global_step=36360, RunningAvgSamplesPerSec=2.634184017283653, CurrSamplesPerSec=2.6350251213921148, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:39:29,732] [INFO] [logging.py:96:log_dist] [Rank 0] step=36370, skipped=0, lr=[6.849611280485523e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:39:29,734] [INFO] [timer.py:259:stop] epoch=0/micro_step=36370/global_step=36370, RunningAvgSamplesPerSec=2.6341833260113576, CurrSamplesPerSec=2.6236839900577005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:39:45,039] [INFO] [logging.py:96:log_dist] [Rank 0] step=36380, skipped=0, lr=[6.8480458335566515e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:39:45,041] [INFO] [timer.py:259:stop] epoch=0/micro_step=36380/global_step=36380, RunningAvgSamplesPerSec=2.6341815480346256, CurrSamplesPerSec=2.6131651698312117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:40:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=36390, skipped=0, lr=[6.846480176781581e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:40:00,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=36390/global_step=36390, RunningAvgSamplesPerSec=2.63417942372264, CurrSamplesPerSec=2.616145088178963, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:40:15,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=36400, skipped=0, lr=[6.844914310338089e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:40:15,649] [INFO] [timer.py:259:stop] epoch=0/micro_step=36400/global_step=36400, RunningAvgSamplesPerSec=2.6341770419711037, CurrSamplesPerSec=2.6085187860311883, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:40:30,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=36410, skipped=0, lr=[6.843348234403982e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:40:30,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=36410/global_step=36410, RunningAvgSamplesPerSec=2.6341749447786507, CurrSamplesPerSec=2.6266365565175995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:40:46,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=36420, skipped=0, lr=[6.841781949157087e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:40:46,199] [INFO] [timer.py:259:stop] epoch=0/micro_step=36420/global_step=36420, RunningAvgSamplesPerSec=2.634174982612094, CurrSamplesPerSec=2.606547574209195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:41:01,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=36430, skipped=0, lr=[6.8402154547752585e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:41:01,522] [INFO] [timer.py:259:stop] epoch=0/micro_step=36430/global_step=36430, RunningAvgSamplesPerSec=2.6341720187944335, CurrSamplesPerSec=2.642565579980163, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:41:16,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=36440, skipped=0, lr=[6.838648751436372e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:41:16,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=36440/global_step=36440, RunningAvgSamplesPerSec=2.6341691781961747, CurrSamplesPerSec=2.6350495391430795, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:41:32,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=36450, skipped=0, lr=[6.837081839318324e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:41:32,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=36450/global_step=36450, RunningAvgSamplesPerSec=2.6341687529186526, CurrSamplesPerSec=2.631389037600381, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:41:47,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=36460, skipped=0, lr=[6.8355147185990415e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:41:47,369] [INFO] [timer.py:259:stop] epoch=0/micro_step=36460/global_step=36460, RunningAvgSamplesPerSec=2.6341681096346448, CurrSamplesPerSec=2.640295757007777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:42:02,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=36470, skipped=0, lr=[6.833947389456473e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:42:02,660] [INFO] [timer.py:259:stop] epoch=0/micro_step=36470/global_step=36470, RunningAvgSamplesPerSec=2.634166383221505, CurrSamplesPerSec=2.629778784323833, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:42:17,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=36480, skipped=0, lr=[6.832379852068584e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:42:17,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=36480/global_step=36480, RunningAvgSamplesPerSec=2.6341652594515166, CurrSamplesPerSec=2.622336021773319, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:42:33,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=36490, skipped=0, lr=[6.830812106613372e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:42:33,279] [INFO] [timer.py:259:stop] epoch=0/micro_step=36490/global_step=36490, RunningAvgSamplesPerSec=2.6341612179636473, CurrSamplesPerSec=2.6334703532851838, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:42:48,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=36500, skipped=0, lr=[6.829244153268853e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:42:48,536] [INFO] [timer.py:259:stop] epoch=0/micro_step=36500/global_step=36500, RunningAvgSamplesPerSec=2.6341607811630032, CurrSamplesPerSec=2.6046545502553857, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:43:03,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=36510, skipped=0, lr=[6.8276759922130695e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:43:03,910] [INFO] [timer.py:259:stop] epoch=0/micro_step=36510/global_step=36510, RunningAvgSamplesPerSec=2.6341551815883464, CurrSamplesPerSec=2.6330938287770747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:43:19,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=36520, skipped=0, lr=[6.82610762362409e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:43:19,167] [INFO] [timer.py:259:stop] epoch=0/micro_step=36520/global_step=36520, RunningAvgSamplesPerSec=2.6341547325703916, CurrSamplesPerSec=2.5820610696147486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:43:34,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=36530, skipped=0, lr=[6.824539047679999e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:43:34,408] [INFO] [timer.py:259:stop] epoch=0/micro_step=36530/global_step=36530, RunningAvgSamplesPerSec=2.6341553148121535, CurrSamplesPerSec=2.6377482103767766, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:43:49,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=36540, skipped=0, lr=[6.822970264558912e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:43:49,668] [INFO] [timer.py:259:stop] epoch=0/micro_step=36540/global_step=36540, RunningAvgSamplesPerSec=2.6341551888060986, CurrSamplesPerSec=2.645573752870039, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:44:04,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=36550, skipped=0, lr=[6.821401274438963e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:44:04,896] [INFO] [timer.py:259:stop] epoch=0/micro_step=36550/global_step=36550, RunningAvgSamplesPerSec=2.634156348148016, CurrSamplesPerSec=2.6389186325124623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:44:20,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=36560, skipped=0, lr=[6.819832077498309e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:44:20,236] [INFO] [timer.py:259:stop] epoch=0/micro_step=36560/global_step=36560, RunningAvgSamplesPerSec=2.634152832625231, CurrSamplesPerSec=2.632992173222371, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:44:35,550] [INFO] [logging.py:96:log_dist] [Rank 0] step=36570, skipped=0, lr=[6.818262673915137e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:44:35,552] [INFO] [timer.py:259:stop] epoch=0/micro_step=36570/global_step=36570, RunningAvgSamplesPerSec=2.6341504094473684, CurrSamplesPerSec=2.6319186574873616, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:44:50,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=36580, skipped=0, lr=[6.816693063867652e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:44:50,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=36580/global_step=36580, RunningAvgSamplesPerSec=2.6341469627491074, CurrSamplesPerSec=2.6349067637923005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:45:06,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=36590, skipped=0, lr=[6.8151232475340826e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:45:06,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=36590/global_step=36590, RunningAvgSamplesPerSec=2.6341458132848388, CurrSamplesPerSec=2.6095619288130356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:45:21,474] [INFO] [logging.py:96:log_dist] [Rank 0] step=36600, skipped=0, lr=[6.813553225092683e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:45:21,475] [INFO] [timer.py:259:stop] epoch=0/micro_step=36600/global_step=36600, RunningAvgSamplesPerSec=2.634143366993107, CurrSamplesPerSec=2.6279663102986754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:45:36,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=36610, skipped=0, lr=[6.811982996721729e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:45:36,799] [INFO] [timer.py:259:stop] epoch=0/micro_step=36610/global_step=36610, RunningAvgSamplesPerSec=2.6341401333758863, CurrSamplesPerSec=2.6340376164999064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:45:52,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=36620, skipped=0, lr=[6.810412562599521e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:45:52,117] [INFO] [timer.py:259:stop] epoch=0/micro_step=36620/global_step=36620, RunningAvgSamplesPerSec=2.634138240944533, CurrSamplesPerSec=2.6251553354610038, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:46:07,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=36630, skipped=0, lr=[6.808841922904382e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:46:07,404] [INFO] [timer.py:259:stop] epoch=0/micro_step=36630/global_step=36630, RunningAvgSamplesPerSec=2.6341364727710292, CurrSamplesPerSec=2.617342552563397, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:46:22,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=36640, skipped=0, lr=[6.80727107781466e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:46:22,709] [INFO] [timer.py:259:stop] epoch=0/micro_step=36640/global_step=36640, RunningAvgSamplesPerSec=2.6341339590547483, CurrSamplesPerSec=2.638252181932026, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:46:38,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=36650, skipped=0, lr=[6.805700027508724e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:46:38,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=36650/global_step=36650, RunningAvgSamplesPerSec=2.634130844938228, CurrSamplesPerSec=2.63120374162601, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:46:53,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=36660, skipped=0, lr=[6.804128772164965e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:46:53,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=36660/global_step=36660, RunningAvgSamplesPerSec=2.6341239636667844, CurrSamplesPerSec=2.615754333315269, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:47:08,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=36670, skipped=0, lr=[6.802557311961803e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:47:08,808] [INFO] [timer.py:259:stop] epoch=0/micro_step=36670/global_step=36670, RunningAvgSamplesPerSec=2.6341194818384364, CurrSamplesPerSec=2.591571252765755, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:47:24,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=36680, skipped=0, lr=[6.800985647077678e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:47:24,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=36680/global_step=36680, RunningAvgSamplesPerSec=2.6341154511861853, CurrSamplesPerSec=2.5968674684989743, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:47:39,592] [INFO] [logging.py:96:log_dist] [Rank 0] step=36690, skipped=0, lr=[6.799413777691051e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:47:39,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=36690/global_step=36690, RunningAvgSamplesPerSec=2.63410670992314, CurrSamplesPerSec=2.575128252025377, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:47:54,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=36700, skipped=0, lr=[6.797841703980408e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:47:54,912] [INFO] [timer.py:259:stop] epoch=0/micro_step=36700/global_step=36700, RunningAvgSamplesPerSec=2.6341040114367877, CurrSamplesPerSec=2.6105888450462853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:48:10,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=36710, skipped=0, lr=[6.7962694261242575e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:48:10,225] [INFO] [timer.py:259:stop] epoch=0/micro_step=36710/global_step=36710, RunningAvgSamplesPerSec=2.634101770568143, CurrSamplesPerSec=2.626577752582816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:48:25,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=36720, skipped=0, lr=[6.794696944301136e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:48:25,507] [INFO] [timer.py:259:stop] epoch=0/micro_step=36720/global_step=36720, RunningAvgSamplesPerSec=2.634100729868371, CurrSamplesPerSec=2.648728444382859, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:48:40,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=36730, skipped=0, lr=[6.793124258689597e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:48:40,805] [INFO] [timer.py:259:stop] epoch=0/micro_step=36730/global_step=36730, RunningAvgSamplesPerSec=2.6340990895772967, CurrSamplesPerSec=2.6366753718138245, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:48:56,138] [INFO] [logging.py:96:log_dist] [Rank 0] step=36740, skipped=0, lr=[6.79155136946822e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:48:56,139] [INFO] [timer.py:259:stop] epoch=0/micro_step=36740/global_step=36740, RunningAvgSamplesPerSec=2.6340956807167846, CurrSamplesPerSec=2.629130949811204, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:49:11,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=36750, skipped=0, lr=[6.789978276815608e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:49:11,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=36750/global_step=36750, RunningAvgSamplesPerSec=2.6340929344563238, CurrSamplesPerSec=2.6275967077620392, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:49:26,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=36760, skipped=0, lr=[6.788404980910383e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:49:26,821] [INFO] [timer.py:259:stop] epoch=0/micro_step=36760/global_step=36760, RunningAvgSamplesPerSec=2.63408835294442, CurrSamplesPerSec=2.637689322563614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:49:42,149] [INFO] [logging.py:96:log_dist] [Rank 0] step=36770, skipped=0, lr=[6.786831481931197e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:49:42,163] [INFO] [timer.py:259:stop] epoch=0/micro_step=36770/global_step=36770, RunningAvgSamplesPerSec=2.6340848625972813, CurrSamplesPerSec=2.6074444626845272, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:49:57,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=36780, skipped=0, lr=[6.785257780056718e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:49:57,462] [INFO] [timer.py:259:stop] epoch=0/micro_step=36780/global_step=36780, RunningAvgSamplesPerSec=2.6340825084735706, CurrSamplesPerSec=2.6357731721200035, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:50:12,768] [INFO] [logging.py:96:log_dist] [Rank 0] step=36790, skipped=0, lr=[6.783683875465643e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:50:12,776] [INFO] [timer.py:259:stop] epoch=0/micro_step=36790/global_step=36790, RunningAvgSamplesPerSec=2.634079879627766, CurrSamplesPerSec=2.6387575911912595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:50:28,023] [INFO] [logging.py:96:log_dist] [Rank 0] step=36800, skipped=0, lr=[6.782109768336687e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:50:28,044] [INFO] [timer.py:259:stop] epoch=0/micro_step=36800/global_step=36800, RunningAvgSamplesPerSec=2.6340789229387553, CurrSamplesPerSec=2.6603411737231166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:50:43,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=36810, skipped=0, lr=[6.780535458848594e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:50:43,294] [INFO] [timer.py:259:stop] epoch=0/micro_step=36810/global_step=36810, RunningAvgSamplesPerSec=2.6340789413440246, CurrSamplesPerSec=2.6368465197515256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:50:58,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=36820, skipped=0, lr=[6.778960947180123e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:50:58,659] [INFO] [timer.py:259:stop] epoch=0/micro_step=36820/global_step=36820, RunningAvgSamplesPerSec=2.634074260350233, CurrSamplesPerSec=2.63481572671967, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:51:13,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=36830, skipped=0, lr=[6.777386233510064e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:51:13,951] [INFO] [timer.py:259:stop] epoch=0/micro_step=36830/global_step=36830, RunningAvgSamplesPerSec=2.634072549848256, CurrSamplesPerSec=2.6504093073897983, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:51:29,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=36840, skipped=0, lr=[6.775811318017222e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:51:29,143] [INFO] [timer.py:259:stop] epoch=0/micro_step=36840/global_step=36840, RunningAvgSamplesPerSec=2.634075249987494, CurrSamplesPerSec=2.6095887182578608, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:51:44,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=36850, skipped=0, lr=[6.774236200880433e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:51:44,399] [INFO] [timer.py:259:stop] epoch=0/micro_step=36850/global_step=36850, RunningAvgSamplesPerSec=2.6340748126016558, CurrSamplesPerSec=2.64836886526363, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:51:59,633] [INFO] [logging.py:96:log_dist] [Rank 0] step=36860, skipped=0, lr=[6.772660882278551e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:51:59,634] [INFO] [timer.py:259:stop] epoch=0/micro_step=36860/global_step=36860, RunningAvgSamplesPerSec=2.6340754669980355, CurrSamplesPerSec=2.633507970320693, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:52:14,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=36870, skipped=0, lr=[6.7710853623904546e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:52:14,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=36870/global_step=36870, RunningAvgSamplesPerSec=2.6340736718745403, CurrSamplesPerSec=2.6159615249046744, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:52:30,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=36880, skipped=0, lr=[6.7695096413950425e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:52:30,239] [INFO] [timer.py:259:stop] epoch=0/micro_step=36880/global_step=36880, RunningAvgSamplesPerSec=2.6340710421354285, CurrSamplesPerSec=2.6307276217895996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:52:45,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=36890, skipped=0, lr=[6.767933719471243e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:52:45,550] [INFO] [timer.py:259:stop] epoch=0/micro_step=36890/global_step=36890, RunningAvgSamplesPerSec=2.6340686843966163, CurrSamplesPerSec=2.6390273880282193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:53:00,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=36900, skipped=0, lr=[6.766357596797994e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:53:00,831] [INFO] [timer.py:259:stop] epoch=0/micro_step=36900/global_step=36900, RunningAvgSamplesPerSec=2.634067626888273, CurrSamplesPerSec=2.6251623184312853, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:53:16,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=36910, skipped=0, lr=[6.764781273554273e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:53:16,168] [INFO] [timer.py:259:stop] epoch=0/micro_step=36910/global_step=36910, RunningAvgSamplesPerSec=2.634064355774218, CurrSamplesPerSec=2.6302347676504034, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:53:31,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=36920, skipped=0, lr=[6.763204749919068e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:53:31,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=36920/global_step=36920, RunningAvgSamplesPerSec=2.6340630060079007, CurrSamplesPerSec=2.6343755271007763, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:53:46,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=36930, skipped=0, lr=[6.761628026071394e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:53:46,746] [INFO] [timer.py:259:stop] epoch=0/micro_step=36930/global_step=36930, RunningAvgSamplesPerSec=2.6340620191919495, CurrSamplesPerSec=2.6373733638533117, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:54:02,042] [INFO] [logging.py:96:log_dist] [Rank 0] step=36940, skipped=0, lr=[6.760051102190291e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:54:02,043] [INFO] [timer.py:259:stop] epoch=0/micro_step=36940/global_step=36940, RunningAvgSamplesPerSec=2.6340599968555347, CurrSamplesPerSec=2.6355313649504617, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:54:17,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=36950, skipped=0, lr=[6.758473978454815e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:54:17,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=36950/global_step=36950, RunningAvgSamplesPerSec=2.6340595112178424, CurrSamplesPerSec=2.648249305932174, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:54:32,646] [INFO] [logging.py:96:log_dist] [Rank 0] step=36960, skipped=0, lr=[6.756896655044053e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:54:32,647] [INFO] [timer.py:259:stop] epoch=0/micro_step=36960/global_step=36960, RunningAvgSamplesPerSec=2.6340559276909703, CurrSamplesPerSec=2.626664108917354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:54:47,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=36970, skipped=0, lr=[6.755319132137108e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:54:47,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=36970/global_step=36970, RunningAvgSamplesPerSec=2.6340547265826113, CurrSamplesPerSec=2.6388605225526085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:55:03,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=36980, skipped=0, lr=[6.753741409913109e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:55:03,224] [INFO] [timer.py:259:stop] epoch=0/micro_step=36980/global_step=36980, RunningAvgSamplesPerSec=2.6340530270184876, CurrSamplesPerSec=2.6484098356297987, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:55:18,568] [INFO] [logging.py:96:log_dist] [Rank 0] step=36990, skipped=0, lr=[6.7521634885512076e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:55:18,569] [INFO] [timer.py:259:stop] epoch=0/micro_step=36990/global_step=36990, RunningAvgSamplesPerSec=2.634048753056704, CurrSamplesPerSec=2.594079676505696, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:55:33,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=37000, skipped=0, lr=[6.7505853682305756e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:55:33,893] [INFO] [timer.py:259:stop] epoch=0/micro_step=37000/global_step=37000, RunningAvgSamplesPerSec=2.6340454467382903, CurrSamplesPerSec=2.629099637656119, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:55:49,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=37010, skipped=0, lr=[6.749007049130411e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:55:49,208] [INFO] [timer.py:259:stop] epoch=0/micro_step=37010/global_step=37010, RunningAvgSamplesPerSec=2.634042862962287, CurrSamplesPerSec=2.6080662461214192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:56:04,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=37020, skipped=0, lr=[6.747428531429931e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:56:04,582] [INFO] [timer.py:259:stop] epoch=0/micro_step=37020/global_step=37020, RunningAvgSamplesPerSec=2.6340375082900156, CurrSamplesPerSec=2.629800631636112, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:56:19,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=37030, skipped=0, lr=[6.745849815308378e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:56:19,887] [INFO] [timer.py:259:stop] epoch=0/micro_step=37030/global_step=37030, RunningAvgSamplesPerSec=2.6340352669645433, CurrSamplesPerSec=2.6000427876900685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:56:35,190] [INFO] [logging.py:96:log_dist] [Rank 0] step=37040, skipped=0, lr=[6.744270900945016e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:56:35,200] [INFO] [timer.py:259:stop] epoch=0/micro_step=37040/global_step=37040, RunningAvgSamplesPerSec=2.6340328313690318, CurrSamplesPerSec=2.641497137758288, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:56:50,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=37050, skipped=0, lr=[6.742691788519128e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:56:50,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=37050/global_step=37050, RunningAvgSamplesPerSec=2.6340334323602432, CurrSamplesPerSec=2.6413977433501623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:57:05,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=37060, skipped=0, lr=[6.7411124782100255e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:57:05,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=37060/global_step=37060, RunningAvgSamplesPerSec=2.634031597155538, CurrSamplesPerSec=2.612147613707499, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:57:21,073] [INFO] [logging.py:96:log_dist] [Rank 0] step=37070, skipped=0, lr=[6.739532970197041e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:57:21,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=37070/global_step=37070, RunningAvgSamplesPerSec=2.6340277225929474, CurrSamplesPerSec=2.629327903885261, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:57:36,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=37080, skipped=0, lr=[6.737953264659525e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:57:36,374] [INFO] [timer.py:259:stop] epoch=0/micro_step=37080/global_step=37080, RunningAvgSamplesPerSec=2.6340260299447342, CurrSamplesPerSec=2.622345449048467, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:57:51,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=37090, skipped=0, lr=[6.736373361776854e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:57:51,700] [INFO] [timer.py:259:stop] epoch=0/micro_step=37090/global_step=37090, RunningAvgSamplesPerSec=2.6340225030307205, CurrSamplesPerSec=2.6204621604991325, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:58:07,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=37100, skipped=0, lr=[6.734793261728429e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:58:07,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=37100/global_step=37100, RunningAvgSamplesPerSec=2.634019494624258, CurrSamplesPerSec=2.6370350982888544, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:58:22,319] [INFO] [logging.py:96:log_dist] [Rank 0] step=37110, skipped=0, lr=[6.733212964693669e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:58:22,321] [INFO] [timer.py:259:stop] epoch=0/micro_step=37110/global_step=37110, RunningAvgSamplesPerSec=2.6340173790716657, CurrSamplesPerSec=2.6381098889725996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:58:37,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=37120, skipped=0, lr=[6.731632470852017e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:58:37,574] [INFO] [timer.py:259:stop] epoch=0/micro_step=37120/global_step=37120, RunningAvgSamplesPerSec=2.634017770132052, CurrSamplesPerSec=2.629026716186068, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:58:52,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=37130, skipped=0, lr=[6.73005178038294e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:58:52,881] [INFO] [timer.py:259:stop] epoch=0/micro_step=37130/global_step=37130, RunningAvgSamplesPerSec=2.6340156331636253, CurrSamplesPerSec=2.637096029427472, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:59:08,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=37140, skipped=0, lr=[6.728470893465923e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:59:08,132] [INFO] [timer.py:259:stop] epoch=0/micro_step=37140/global_step=37140, RunningAvgSamplesPerSec=2.6340159529726908, CurrSamplesPerSec=2.6504708581498777, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:59:23,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=37150, skipped=0, lr=[6.726889810280481e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:59:23,406] [INFO] [timer.py:259:stop] epoch=0/micro_step=37150/global_step=37150, RunningAvgSamplesPerSec=2.6340153972839, CurrSamplesPerSec=2.6531953513099804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:59:38,666] [INFO] [logging.py:96:log_dist] [Rank 0] step=37160, skipped=0, lr=[6.725308531006143e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:59:38,695] [INFO] [timer.py:259:stop] epoch=0/micro_step=37160/global_step=37160, RunningAvgSamplesPerSec=2.634013742585246, CurrSamplesPerSec=2.591413536602761, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 15:59:53,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=37170, skipped=0, lr=[6.723727055822464e-06], mom=[(0.9, 0.95)] +[2024-11-01 15:59:54,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=37170/global_step=37170, RunningAvgSamplesPerSec=2.634010498291278, CurrSamplesPerSec=2.5985885827572295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:00:09,422] [INFO] [logging.py:96:log_dist] [Rank 0] step=37180, skipped=0, lr=[6.722145384909021e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:00:09,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=37180/global_step=37180, RunningAvgSamplesPerSec=2.634003877763761, CurrSamplesPerSec=2.6171506559760354, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:00:24,705] [INFO] [logging.py:96:log_dist] [Rank 0] step=37190, skipped=0, lr=[6.7205635184454145e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:00:24,716] [INFO] [timer.py:259:stop] epoch=0/micro_step=37190/global_step=37190, RunningAvgSamplesPerSec=2.634002320598605, CurrSamplesPerSec=2.6362154959104043, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:00:40,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=37200, skipped=0, lr=[6.718981456611265e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:00:40,051] [INFO] [timer.py:259:stop] epoch=0/micro_step=37200/global_step=37200, RunningAvgSamplesPerSec=2.6339988720621754, CurrSamplesPerSec=2.608374815892076, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:00:55,356] [INFO] [logging.py:96:log_dist] [Rank 0] step=37210, skipped=0, lr=[6.717399199586215e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:00:55,364] [INFO] [timer.py:259:stop] epoch=0/micro_step=37210/global_step=37210, RunningAvgSamplesPerSec=2.633996250954211, CurrSamplesPerSec=2.6357238961692246, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:01:10,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=37220, skipped=0, lr=[6.715816747549934e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:01:10,652] [INFO] [timer.py:259:stop] epoch=0/micro_step=37220/global_step=37220, RunningAvgSamplesPerSec=2.63399447508057, CurrSamplesPerSec=2.644166947790436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:01:25,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=37230, skipped=0, lr=[6.714234100682108e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:01:25,959] [INFO] [timer.py:259:stop] epoch=0/micro_step=37230/global_step=37230, RunningAvgSamplesPerSec=2.6339928673513024, CurrSamplesPerSec=2.638573745878049, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:01:41,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=37240, skipped=0, lr=[6.712651259162444e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:01:41,236] [INFO] [timer.py:259:stop] epoch=0/micro_step=37240/global_step=37240, RunningAvgSamplesPerSec=2.6339922561376783, CurrSamplesPerSec=2.6288767661014054, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:01:56,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=37250, skipped=0, lr=[6.7110682231706784e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:01:56,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=37250/global_step=37250, RunningAvgSamplesPerSec=2.633986500229136, CurrSamplesPerSec=2.639231224988515, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:02:11,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=37260, skipped=0, lr=[6.709484992886564e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:02:11,891] [INFO] [timer.py:259:stop] epoch=0/micro_step=37260/global_step=37260, RunningAvgSamplesPerSec=2.6339855088250084, CurrSamplesPerSec=2.6048603915337623, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:02:27,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=37270, skipped=0, lr=[6.707901568489876e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:02:27,208] [INFO] [timer.py:259:stop] epoch=0/micro_step=37270/global_step=37270, RunningAvgSamplesPerSec=2.633983273315553, CurrSamplesPerSec=2.6366413935108106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:02:42,560] [INFO] [logging.py:96:log_dist] [Rank 0] step=37280, skipped=0, lr=[6.706317950160415e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:02:42,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=37280/global_step=37280, RunningAvgSamplesPerSec=2.633979640958152, CurrSamplesPerSec=2.6368274561698297, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:02:57,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=37290, skipped=0, lr=[6.704734138077999e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:02:57,857] [INFO] [timer.py:259:stop] epoch=0/micro_step=37290/global_step=37290, RunningAvgSamplesPerSec=2.633979114465125, CurrSamplesPerSec=2.6386231284456767, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:03:13,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=37300, skipped=0, lr=[6.703150132422472e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:03:13,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=37300/global_step=37300, RunningAvgSamplesPerSec=2.6339766351477016, CurrSamplesPerSec=2.6211884501537823, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:03:28,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=37310, skipped=0, lr=[6.701565933373697e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:03:28,467] [INFO] [timer.py:259:stop] epoch=0/micro_step=37310/global_step=37310, RunningAvgSamplesPerSec=2.6339757139848157, CurrSamplesPerSec=2.6063511837292483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:03:43,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=37320, skipped=0, lr=[6.699981541111562e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:03:43,773] [INFO] [timer.py:259:stop] epoch=0/micro_step=37320/global_step=37320, RunningAvgSamplesPerSec=2.633973539495411, CurrSamplesPerSec=2.636203897511491, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:03:59,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=37330, skipped=0, lr=[6.698396955815976e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:03:59,106] [INFO] [timer.py:259:stop] epoch=0/micro_step=37330/global_step=37330, RunningAvgSamplesPerSec=2.6339705313361628, CurrSamplesPerSec=2.594656578692671, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:04:14,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=37340, skipped=0, lr=[6.6968121776668635e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:04:14,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=37340/global_step=37340, RunningAvgSamplesPerSec=2.6339698647563057, CurrSamplesPerSec=2.6390473136744452, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:04:29,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=37350, skipped=0, lr=[6.695227206844184e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:04:29,702] [INFO] [timer.py:259:stop] epoch=0/micro_step=37350/global_step=37350, RunningAvgSamplesPerSec=2.633968061164067, CurrSamplesPerSec=2.577245794110989, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:04:44,984] [INFO] [logging.py:96:log_dist] [Rank 0] step=37360, skipped=0, lr=[6.693642043527908e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:04:44,986] [INFO] [timer.py:259:stop] epoch=0/micro_step=37360/global_step=37360, RunningAvgSamplesPerSec=2.6339672154740246, CurrSamplesPerSec=2.6295607433058232, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:05:00,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=37370, skipped=0, lr=[6.692056687898029e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:05:00,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=37370/global_step=37370, RunningAvgSamplesPerSec=2.6339639894671842, CurrSamplesPerSec=2.603429881184934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:05:15,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=37380, skipped=0, lr=[6.69047114013457e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:05:15,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=37380/global_step=37380, RunningAvgSamplesPerSec=2.633962435026786, CurrSamplesPerSec=2.6378962709311566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:05:30,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=37390, skipped=0, lr=[6.688885400417565e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:05:30,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=37390/global_step=37390, RunningAvgSamplesPerSec=2.6339606104316045, CurrSamplesPerSec=2.6457744299941597, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:05:46,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=37400, skipped=0, lr=[6.687299468927079e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:05:46,165] [INFO] [timer.py:259:stop] epoch=0/micro_step=37400/global_step=37400, RunningAvgSamplesPerSec=2.633961555967342, CurrSamplesPerSec=2.6434582750140523, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:06:01,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=37410, skipped=0, lr=[6.685713345843193e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:06:01,443] [INFO] [timer.py:259:stop] epoch=0/micro_step=37410/global_step=37410, RunningAvgSamplesPerSec=2.6339614046021604, CurrSamplesPerSec=2.652704109823256, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:06:16,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=37420, skipped=0, lr=[6.684127031346014e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:06:16,727] [INFO] [timer.py:259:stop] epoch=0/micro_step=37420/global_step=37420, RunningAvgSamplesPerSec=2.633960705045748, CurrSamplesPerSec=2.6298801918977093, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:06:32,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=37430, skipped=0, lr=[6.682540525615666e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:06:32,028] [INFO] [timer.py:259:stop] epoch=0/micro_step=37430/global_step=37430, RunningAvgSamplesPerSec=2.6339589838480153, CurrSamplesPerSec=2.638322711828684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:06:47,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=37440, skipped=0, lr=[6.680953828832299e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:06:47,266] [INFO] [timer.py:259:stop] epoch=0/micro_step=37440/global_step=37440, RunningAvgSamplesPerSec=2.6339600341036737, CurrSamplesPerSec=2.6536976880330996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:07:02,548] [INFO] [logging.py:96:log_dist] [Rank 0] step=37450, skipped=0, lr=[6.679366941176082e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:07:02,565] [INFO] [timer.py:259:stop] epoch=0/micro_step=37450/global_step=37450, RunningAvgSamplesPerSec=2.633958819257081, CurrSamplesPerSec=2.620059885787015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:07:17,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=37460, skipped=0, lr=[6.677779862827206e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:07:17,785] [INFO] [timer.py:259:stop] epoch=0/micro_step=37460/global_step=37460, RunningAvgSamplesPerSec=2.633961059861393, CurrSamplesPerSec=2.6693450700064543, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:07:33,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=37470, skipped=0, lr=[6.676192593965886e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:07:33,087] [INFO] [timer.py:259:stop] epoch=0/micro_step=37470/global_step=37470, RunningAvgSamplesPerSec=2.633960125443982, CurrSamplesPerSec=2.654146888586056, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:07:48,385] [INFO] [logging.py:96:log_dist] [Rank 0] step=37480, skipped=0, lr=[6.674605134772356e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:07:48,386] [INFO] [timer.py:259:stop] epoch=0/micro_step=37480/global_step=37480, RunningAvgSamplesPerSec=2.6339589161243717, CurrSamplesPerSec=2.59719470278698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:08:03,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=37490, skipped=0, lr=[6.6730174854268725e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:08:03,720] [INFO] [timer.py:259:stop] epoch=0/micro_step=37490/global_step=37490, RunningAvgSamplesPerSec=2.6339556335815932, CurrSamplesPerSec=2.6253747006261876, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:08:18,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=37500, skipped=0, lr=[6.671429646109714e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:08:19,010] [INFO] [timer.py:259:stop] epoch=0/micro_step=37500/global_step=37500, RunningAvgSamplesPerSec=2.63395500484758, CurrSamplesPerSec=2.6491454275251547, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:08:34,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=37510, skipped=0, lr=[6.66984161700118e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:08:34,332] [INFO] [timer.py:259:stop] epoch=0/micro_step=37510/global_step=37510, RunningAvgSamplesPerSec=2.6339523285848077, CurrSamplesPerSec=2.639078448099053, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:08:49,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=37520, skipped=0, lr=[6.66825339828159e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:08:49,583] [INFO] [timer.py:259:stop] epoch=0/micro_step=37520/global_step=37520, RunningAvgSamplesPerSec=2.6339531356232073, CurrSamplesPerSec=2.5676264828820323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:09:04,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=37530, skipped=0, lr=[6.66666499013129e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:09:04,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=37530/global_step=37530, RunningAvgSamplesPerSec=2.633954487417739, CurrSamplesPerSec=2.6323460588470344, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:09:20,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=37540, skipped=0, lr=[6.66507639273064e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:09:20,103] [INFO] [timer.py:259:stop] epoch=0/micro_step=37540/global_step=37540, RunningAvgSamplesPerSec=2.633953187957025, CurrSamplesPerSec=2.6187708201722457, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:09:35,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=37550, skipped=0, lr=[6.66348760626003e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:09:35,357] [INFO] [timer.py:259:stop] epoch=0/micro_step=37550/global_step=37550, RunningAvgSamplesPerSec=2.633953741173368, CurrSamplesPerSec=2.6359897596980844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:09:50,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=37560, skipped=0, lr=[6.661898630899866e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:09:50,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=37560/global_step=37560, RunningAvgSamplesPerSec=2.6339534962434556, CurrSamplesPerSec=2.6494462220335966, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:10:05,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=37570, skipped=0, lr=[6.660309466830576e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:10:05,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=37570/global_step=37570, RunningAvgSamplesPerSec=2.6339516945268495, CurrSamplesPerSec=2.628829807229644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:10:21,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=37580, skipped=0, lr=[6.65872011423261e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:10:21,092] [INFO] [timer.py:259:stop] epoch=0/micro_step=37580/global_step=37580, RunningAvgSamplesPerSec=2.6339553515074976, CurrSamplesPerSec=2.6319145286825867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:10:36,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=37590, skipped=0, lr=[6.657130573286438e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:10:36,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=37590/global_step=37590, RunningAvgSamplesPerSec=2.633955732807744, CurrSamplesPerSec=2.6286102761643417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:10:51,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=37600, skipped=0, lr=[6.655540844172556e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:10:51,623] [INFO] [timer.py:259:stop] epoch=0/micro_step=37600/global_step=37600, RunningAvgSamplesPerSec=2.6339548445940038, CurrSamplesPerSec=2.6263462633773993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:11:06,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=37610, skipped=0, lr=[6.6539509270714774e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:11:06,841] [INFO] [timer.py:259:stop] epoch=0/micro_step=37610/global_step=37610, RunningAvgSamplesPerSec=2.633956862931393, CurrSamplesPerSec=2.65180180715108, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:11:22,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=37620, skipped=0, lr=[6.652360822163737e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:11:22,065] [INFO] [timer.py:259:stop] epoch=0/micro_step=37620/global_step=37620, RunningAvgSamplesPerSec=2.6339587088344003, CurrSamplesPerSec=2.6452358830492, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:11:37,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=37630, skipped=0, lr=[6.650770529629892e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:11:37,308] [INFO] [timer.py:259:stop] epoch=0/micro_step=37630/global_step=37630, RunningAvgSamplesPerSec=2.6339593288867307, CurrSamplesPerSec=2.6375562126192134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:11:52,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=37640, skipped=0, lr=[6.649180049650521e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:11:52,535] [INFO] [timer.py:259:stop] epoch=0/micro_step=37640/global_step=37640, RunningAvgSamplesPerSec=2.633961493851686, CurrSamplesPerSec=2.6287255974806585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:12:07,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=37650, skipped=0, lr=[6.647589382406222e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:12:07,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=37650/global_step=37650, RunningAvgSamplesPerSec=2.633960162050033, CurrSamplesPerSec=2.6348273129068156, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:12:23,154] [INFO] [logging.py:96:log_dist] [Rank 0] step=37660, skipped=0, lr=[6.645998528077617e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:12:23,165] [INFO] [timer.py:259:stop] epoch=0/micro_step=37660/global_step=37660, RunningAvgSamplesPerSec=2.6339582171208, CurrSamplesPerSec=2.6626096894203033, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:12:38,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=37670, skipped=0, lr=[6.644407486845349e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:12:38,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=37670/global_step=37670, RunningAvgSamplesPerSec=2.633955425733951, CurrSamplesPerSec=2.5899157901123826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:12:53,812] [INFO] [logging.py:96:log_dist] [Rank 0] step=37680, skipped=0, lr=[6.6428162588900805e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:12:53,813] [INFO] [timer.py:259:stop] epoch=0/micro_step=37680/global_step=37680, RunningAvgSamplesPerSec=2.633953953713498, CurrSamplesPerSec=2.6298781306902947, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:13:09,029] [INFO] [logging.py:96:log_dist] [Rank 0] step=37690, skipped=0, lr=[6.641224844392493e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:13:09,030] [INFO] [timer.py:259:stop] epoch=0/micro_step=37690/global_step=37690, RunningAvgSamplesPerSec=2.633956381182101, CurrSamplesPerSec=2.61895191616826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:13:24,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=37700, skipped=0, lr=[6.639633243533297e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:13:24,345] [INFO] [timer.py:259:stop] epoch=0/micro_step=37700/global_step=37700, RunningAvgSamplesPerSec=2.6339556656916794, CurrSamplesPerSec=2.63121240744843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:13:39,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=37710, skipped=0, lr=[6.638041456493218e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:13:39,736] [INFO] [timer.py:259:stop] epoch=0/micro_step=37710/global_step=37710, RunningAvgSamplesPerSec=2.633951467411223, CurrSamplesPerSec=2.59963950664965, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:13:55,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=37720, skipped=0, lr=[6.636449483453001e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:13:55,021] [INFO] [timer.py:259:stop] epoch=0/micro_step=37720/global_step=37720, RunningAvgSamplesPerSec=2.6339517667482437, CurrSamplesPerSec=2.6251684799065025, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:14:10,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=37730, skipped=0, lr=[6.634857324593418e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:14:10,283] [INFO] [timer.py:259:stop] epoch=0/micro_step=37730/global_step=37730, RunningAvgSamplesPerSec=2.633953149267129, CurrSamplesPerSec=2.600430071343284, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:14:25,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=37740, skipped=0, lr=[6.6332649800952574e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:14:25,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=37740/global_step=37740, RunningAvgSamplesPerSec=2.633954902453709, CurrSamplesPerSec=2.630557267214618, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:14:40,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=37750, skipped=0, lr=[6.631672450139331e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:14:40,764] [INFO] [timer.py:259:stop] epoch=0/micro_step=37750/global_step=37750, RunningAvgSamplesPerSec=2.6339555098198884, CurrSamplesPerSec=2.6441165240641276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:14:55,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=37760, skipped=0, lr=[6.6300797349064724e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:14:55,992] [INFO] [timer.py:259:stop] epoch=0/micro_step=37760/global_step=37760, RunningAvgSamplesPerSec=2.6339567029868225, CurrSamplesPerSec=2.624566435083957, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:15:11,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=37770, skipped=0, lr=[6.628486834577535e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:15:11,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=37770/global_step=37770, RunningAvgSamplesPerSec=2.6339559959415455, CurrSamplesPerSec=2.636710179764003, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:15:26,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=37780, skipped=0, lr=[6.62689374933339e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:15:26,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=37780/global_step=37780, RunningAvgSamplesPerSec=2.6339565110491785, CurrSamplesPerSec=2.63874514036133, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:15:41,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=37790, skipped=0, lr=[6.625300479354934e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:15:41,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=37790/global_step=37790, RunningAvgSamplesPerSec=2.633953001984528, CurrSamplesPerSec=2.637166082991477, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:15:57,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=37800, skipped=0, lr=[6.623707024823087e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:15:57,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=37800/global_step=37800, RunningAvgSamplesPerSec=2.6339515352375757, CurrSamplesPerSec=2.6325620835405488, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:16:12,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=37810, skipped=0, lr=[6.622113385918781e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:16:12,558] [INFO] [timer.py:259:stop] epoch=0/micro_step=37810/global_step=37810, RunningAvgSamplesPerSec=2.63394604460881, CurrSamplesPerSec=2.634090965042039, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:16:27,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=37820, skipped=0, lr=[6.620519562822978e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:16:27,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=37820/global_step=37820, RunningAvgSamplesPerSec=2.6339446509352995, CurrSamplesPerSec=2.6354498064431833, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:16:43,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=37830, skipped=0, lr=[6.618925555716654e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:16:43,140] [INFO] [timer.py:259:stop] epoch=0/micro_step=37830/global_step=37830, RunningAvgSamplesPerSec=2.633943041097408, CurrSamplesPerSec=2.632052024848587, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:16:58,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=37840, skipped=0, lr=[6.617331364780813e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:16:58,443] [INFO] [timer.py:259:stop] epoch=0/micro_step=37840/global_step=37840, RunningAvgSamplesPerSec=2.6339410866596884, CurrSamplesPerSec=2.590530440719517, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:17:13,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=37850, skipped=0, lr=[6.615736990196472e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:17:13,693] [INFO] [timer.py:259:stop] epoch=0/micro_step=37850/global_step=37850, RunningAvgSamplesPerSec=2.633941446795137, CurrSamplesPerSec=2.6379966463621525, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:17:28,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=37860, skipped=0, lr=[6.614142432144675e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:17:28,950] [INFO] [timer.py:259:stop] epoch=0/micro_step=37860/global_step=37860, RunningAvgSamplesPerSec=2.633941683749477, CurrSamplesPerSec=2.622142572659269, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:17:44,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=37870, skipped=0, lr=[6.6125476908064836e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:17:44,269] [INFO] [timer.py:259:stop] epoch=0/micro_step=37870/global_step=37870, RunningAvgSamplesPerSec=2.6339396881633275, CurrSamplesPerSec=2.6369642227518986, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:17:59,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=37880, skipped=0, lr=[6.610952766362979e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:17:59,568] [INFO] [timer.py:259:stop] epoch=0/micro_step=37880/global_step=37880, RunningAvgSamplesPerSec=2.633937920718819, CurrSamplesPerSec=2.6009130277500843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:18:14,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=37890, skipped=0, lr=[6.6093576589952715e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:18:14,839] [INFO] [timer.py:259:stop] epoch=0/micro_step=37890/global_step=37890, RunningAvgSamplesPerSec=2.6339376879506693, CurrSamplesPerSec=2.634657668200949, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:18:30,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=37900, skipped=0, lr=[6.607762368884481e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:18:30,108] [INFO] [timer.py:259:stop] epoch=0/micro_step=37900/global_step=37900, RunningAvgSamplesPerSec=2.6339374424737643, CurrSamplesPerSec=2.6217909946027964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:18:45,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=37910, skipped=0, lr=[6.606166896211756e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:18:45,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=37910/global_step=37910, RunningAvgSamplesPerSec=2.6339389154911412, CurrSamplesPerSec=2.654055776741732, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:19:00,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=37920, skipped=0, lr=[6.604571241158259e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:19:00,538] [INFO] [timer.py:259:stop] epoch=0/micro_step=37920/global_step=37920, RunningAvgSamplesPerSec=2.633940955911174, CurrSamplesPerSec=2.6291091136243496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:19:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] step=37930, skipped=0, lr=[6.602975403905182e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:19:15,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=37930/global_step=37930, RunningAvgSamplesPerSec=2.6339381856084296, CurrSamplesPerSec=2.645056138109195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:19:31,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=37940, skipped=0, lr=[6.6013793846337295e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:19:31,263] [INFO] [timer.py:259:stop] epoch=0/micro_step=37940/global_step=37940, RunningAvgSamplesPerSec=2.6339336114392715, CurrSamplesPerSec=2.629486972017829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:19:46,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=37950, skipped=0, lr=[6.599783183525131e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:19:46,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=37950/global_step=37950, RunningAvgSamplesPerSec=2.6339316709628275, CurrSamplesPerSec=2.568549474970638, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:20:01,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=37960, skipped=0, lr=[6.598186800760637e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:20:01,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=37960/global_step=37960, RunningAvgSamplesPerSec=2.633929157869372, CurrSamplesPerSec=2.6309607091904157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:20:17,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=37970, skipped=0, lr=[6.596590236521515e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:20:17,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=37970/global_step=37970, RunningAvgSamplesPerSec=2.6339282224323983, CurrSamplesPerSec=2.6393541235839533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:20:32,488] [INFO] [logging.py:96:log_dist] [Rank 0] step=37980, skipped=0, lr=[6.594993490989057e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:20:32,489] [INFO] [timer.py:259:stop] epoch=0/micro_step=37980/global_step=37980, RunningAvgSamplesPerSec=2.6339257917357366, CurrSamplesPerSec=2.6466784802167838, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:20:47,742] [INFO] [logging.py:96:log_dist] [Rank 0] step=37990, skipped=0, lr=[6.593396564344573e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:20:47,757] [INFO] [timer.py:259:stop] epoch=0/micro_step=37990/global_step=37990, RunningAvgSamplesPerSec=2.633925353722999, CurrSamplesPerSec=2.61023873378679, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:21:03,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=38000, skipped=0, lr=[6.591799456769395e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:21:03,019] [INFO] [timer.py:259:stop] epoch=0/micro_step=38000/global_step=38000, RunningAvgSamplesPerSec=2.6339254699768, CurrSamplesPerSec=2.6288396931684943, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:21:18,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=38010, skipped=0, lr=[6.590202168444875e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:21:18,255] [INFO] [timer.py:259:stop] epoch=0/micro_step=38010/global_step=38010, RunningAvgSamplesPerSec=2.633926146816021, CurrSamplesPerSec=2.641330375661555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:21:33,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=38020, skipped=0, lr=[6.588604699552386e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:21:33,492] [INFO] [timer.py:259:stop] epoch=0/micro_step=38020/global_step=38020, RunningAvgSamplesPerSec=2.6339274222919227, CurrSamplesPerSec=2.6436657130754555, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:21:48,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=38030, skipped=0, lr=[6.587007050273319e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:21:48,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=38030/global_step=38030, RunningAvgSamplesPerSec=2.6339274041398526, CurrSamplesPerSec=2.6146887126529585, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:22:03,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=38040, skipped=0, lr=[6.5854092207890905e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:22:04,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=38040/global_step=38040, RunningAvgSamplesPerSec=2.633927590176715, CurrSamplesPerSec=2.6490973234733812, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:22:19,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=38050, skipped=0, lr=[6.583811211281133e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:22:19,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=38050/global_step=38050, RunningAvgSamplesPerSec=2.633926438756885, CurrSamplesPerSec=2.5767601093300807, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:22:34,489] [INFO] [logging.py:96:log_dist] [Rank 0] step=38060, skipped=0, lr=[6.582213021930902e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:22:34,509] [INFO] [timer.py:259:stop] epoch=0/micro_step=38060/global_step=38060, RunningAvgSamplesPerSec=2.6339276542626817, CurrSamplesPerSec=2.656676583161779, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:22:49,792] [INFO] [logging.py:96:log_dist] [Rank 0] step=38070, skipped=0, lr=[6.580614652919872e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:22:49,793] [INFO] [timer.py:259:stop] epoch=0/micro_step=38070/global_step=38070, RunningAvgSamplesPerSec=2.6339262936989347, CurrSamplesPerSec=2.593631330337392, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:23:05,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=38080, skipped=0, lr=[6.579016104429536e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:23:05,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=38080/global_step=38080, RunningAvgSamplesPerSec=2.633927418534789, CurrSamplesPerSec=2.643408711300071, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:23:20,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=38090, skipped=0, lr=[6.577417376641414e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:23:20,276] [INFO] [timer.py:259:stop] epoch=0/micro_step=38090/global_step=38090, RunningAvgSamplesPerSec=2.633927149643287, CurrSamplesPerSec=2.5986590204264273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:23:35,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=38100, skipped=0, lr=[6.575818469737039e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:23:35,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=38100/global_step=38100, RunningAvgSamplesPerSec=2.6339248413512437, CurrSamplesPerSec=2.637259355347146, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:23:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] step=38110, skipped=0, lr=[6.574219383897969e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:23:50,935] [INFO] [timer.py:259:stop] epoch=0/micro_step=38110/global_step=38110, RunningAvgSamplesPerSec=2.6339216896870266, CurrSamplesPerSec=2.6539180711724173, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:24:06,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=38120, skipped=0, lr=[6.572620119305779e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:24:06,176] [INFO] [timer.py:259:stop] epoch=0/micro_step=38120/global_step=38120, RunningAvgSamplesPerSec=2.633922550860208, CurrSamplesPerSec=2.641552036621306, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:24:21,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=38130, skipped=0, lr=[6.571020676142067e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:24:21,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=38130/global_step=38130, RunningAvgSamplesPerSec=2.6339191757866964, CurrSamplesPerSec=2.6340421655161212, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:24:36,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=38140, skipped=0, lr=[6.56942105458845e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:24:36,747] [INFO] [timer.py:259:stop] epoch=0/micro_step=38140/global_step=38140, RunningAvgSamplesPerSec=2.633919732156906, CurrSamplesPerSec=2.6343908323257486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:24:52,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=38150, skipped=0, lr=[6.567821254826566e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:24:52,016] [INFO] [timer.py:259:stop] epoch=0/micro_step=38150/global_step=38150, RunningAvgSamplesPerSec=2.633919061875477, CurrSamplesPerSec=2.6351070674979193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:25:07,243] [INFO] [logging.py:96:log_dist] [Rank 0] step=38160, skipped=0, lr=[6.566221277038074e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:25:07,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=38160/global_step=38160, RunningAvgSamplesPerSec=2.633920698561204, CurrSamplesPerSec=2.640311546619605, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:25:22,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=38170, skipped=0, lr=[6.5646211214046485e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:25:22,511] [INFO] [timer.py:259:stop] epoch=0/micro_step=38170/global_step=38170, RunningAvgSamplesPerSec=2.633920480856315, CurrSamplesPerSec=2.6401702778594194, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:25:37,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=38180, skipped=0, lr=[6.563020788107991e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:25:37,805] [INFO] [timer.py:259:stop] epoch=0/micro_step=38180/global_step=38180, RunningAvgSamplesPerSec=2.6339187789628977, CurrSamplesPerSec=2.6377054957151826, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:25:53,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=38190, skipped=0, lr=[6.561420277329818e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:25:53,100] [INFO] [timer.py:259:stop] epoch=0/micro_step=38190/global_step=38190, RunningAvgSamplesPerSec=2.6339169346622415, CurrSamplesPerSec=2.6191829214485147, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:26:08,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=38200, skipped=0, lr=[6.559819589251869e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:26:08,364] [INFO] [timer.py:259:stop] epoch=0/micro_step=38200/global_step=38200, RunningAvgSamplesPerSec=2.633916639164752, CurrSamplesPerSec=2.634748280581913, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:26:23,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=38210, skipped=0, lr=[6.558218724055902e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:26:23,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=38210/global_step=38210, RunningAvgSamplesPerSec=2.6339175885569746, CurrSamplesPerSec=2.6368419610438694, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:26:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=38220, skipped=0, lr=[6.556617681923695e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:26:38,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=38220/global_step=38220, RunningAvgSamplesPerSec=2.6339149296806004, CurrSamplesPerSec=2.5992653447491234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:26:54,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=38230, skipped=0, lr=[6.555016463037049e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:26:54,178] [INFO] [timer.py:259:stop] epoch=0/micro_step=38230/global_step=38230, RunningAvgSamplesPerSec=2.6339155751324594, CurrSamplesPerSec=2.628606157733037, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:27:09,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=38240, skipped=0, lr=[6.553415067577782e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:27:09,448] [INFO] [timer.py:259:stop] epoch=0/micro_step=38240/global_step=38240, RunningAvgSamplesPerSec=2.6339154164380667, CurrSamplesPerSec=2.5985897902279613, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:27:24,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=38250, skipped=0, lr=[6.5518134957277335e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:27:24,695] [INFO] [timer.py:259:stop] epoch=0/micro_step=38250/global_step=38250, RunningAvgSamplesPerSec=2.633915828519016, CurrSamplesPerSec=2.6378382059876424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:27:40,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=38260, skipped=0, lr=[6.550211747668761e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:27:40,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=38260/global_step=38260, RunningAvgSamplesPerSec=2.6339131135265337, CurrSamplesPerSec=2.6139924926530886, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:27:55,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=38270, skipped=0, lr=[6.5486098235827434e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:27:55,335] [INFO] [timer.py:259:stop] epoch=0/micro_step=38270/global_step=38270, RunningAvgSamplesPerSec=2.6339115673568396, CurrSamplesPerSec=2.6239810820150486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:28:10,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=38280, skipped=0, lr=[6.5470077236515804e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:28:10,610] [INFO] [timer.py:259:stop] epoch=0/micro_step=38280/global_step=38280, RunningAvgSamplesPerSec=2.6339114034157514, CurrSamplesPerSec=2.63585847770125, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:28:25,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=38290, skipped=0, lr=[6.545405448057192e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:28:25,877] [INFO] [timer.py:259:stop] epoch=0/micro_step=38290/global_step=38290, RunningAvgSamplesPerSec=2.6339116469973716, CurrSamplesPerSec=2.637775996384731, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:28:41,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=38300, skipped=0, lr=[6.5438029969815166e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:28:41,151] [INFO] [timer.py:259:stop] epoch=0/micro_step=38300/global_step=38300, RunningAvgSamplesPerSec=2.63391130802484, CurrSamplesPerSec=2.616666956487673, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:28:56,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=38310, skipped=0, lr=[6.542200370606513e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:28:56,451] [INFO] [timer.py:259:stop] epoch=0/micro_step=38310/global_step=38310, RunningAvgSamplesPerSec=2.6339103514849036, CurrSamplesPerSec=2.636334799651595, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:29:11,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=38320, skipped=0, lr=[6.540597569114158e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:29:11,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=38320/global_step=38320, RunningAvgSamplesPerSec=2.633909554618504, CurrSamplesPerSec=2.6312664669626993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:29:27,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=38330, skipped=0, lr=[6.538994592686453e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:29:27,020] [INFO] [timer.py:259:stop] epoch=0/micro_step=38330/global_step=38330, RunningAvgSamplesPerSec=2.633908763951094, CurrSamplesPerSec=2.6398470785617265, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:29:42,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=38340, skipped=0, lr=[6.537391441505415e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:29:42,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=38340/global_step=38340, RunningAvgSamplesPerSec=2.633907041211733, CurrSamplesPerSec=2.6260194520090296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:29:57,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=38350, skipped=0, lr=[6.535788115753083e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:29:57,642] [INFO] [timer.py:259:stop] epoch=0/micro_step=38350/global_step=38350, RunningAvgSamplesPerSec=2.6339045406931776, CurrSamplesPerSec=2.626784194782122, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:30:12,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=38360, skipped=0, lr=[6.534184615611515e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:30:12,941] [INFO] [timer.py:259:stop] epoch=0/micro_step=38360/global_step=38360, RunningAvgSamplesPerSec=2.6339027295025943, CurrSamplesPerSec=2.6258912161848786, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:30:28,205] [INFO] [logging.py:96:log_dist] [Rank 0] step=38370, skipped=0, lr=[6.532580941262787e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:30:28,209] [INFO] [timer.py:259:stop] epoch=0/micro_step=38370/global_step=38370, RunningAvgSamplesPerSec=2.633902066340613, CurrSamplesPerSec=2.6488982331077047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:30:43,455] [INFO] [logging.py:96:log_dist] [Rank 0] step=38380, skipped=0, lr=[6.530977092889002e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:30:43,456] [INFO] [timer.py:259:stop] epoch=0/micro_step=38380/global_step=38380, RunningAvgSamplesPerSec=2.6339022447971443, CurrSamplesPerSec=2.617013895796193, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:30:58,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=38390, skipped=0, lr=[6.529373070672271e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:30:58,731] [INFO] [timer.py:259:stop] epoch=0/micro_step=38390/global_step=38390, RunningAvgSamplesPerSec=2.633901246849249, CurrSamplesPerSec=2.616942051398798, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:31:13,980] [INFO] [logging.py:96:log_dist] [Rank 0] step=38400, skipped=0, lr=[6.527768874794735e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:31:13,987] [INFO] [timer.py:259:stop] epoch=0/micro_step=38400/global_step=38400, RunningAvgSamplesPerSec=2.6339013200418844, CurrSamplesPerSec=2.652133809976712, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:31:29,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=38410, skipped=0, lr=[6.526164505438551e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:31:29,257] [INFO] [timer.py:259:stop] epoch=0/micro_step=38410/global_step=38410, RunningAvgSamplesPerSec=2.6339005334609387, CurrSamplesPerSec=2.6135392731034583, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:31:44,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=38420, skipped=0, lr=[6.524559962785894e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:31:44,493] [INFO] [timer.py:259:stop] epoch=0/micro_step=38420/global_step=38420, RunningAvgSamplesPerSec=2.6339017957551922, CurrSamplesPerSec=2.6371424550420963, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:31:59,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=38430, skipped=0, lr=[6.522955247018962e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:31:59,750] [INFO] [timer.py:259:stop] epoch=0/micro_step=38430/global_step=38430, RunningAvgSamplesPerSec=2.6339013504588693, CurrSamplesPerSec=2.585752496802405, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:32:14,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=38440, skipped=0, lr=[6.521350358319969e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:32:14,989] [INFO] [timer.py:259:stop] epoch=0/micro_step=38440/global_step=38440, RunningAvgSamplesPerSec=2.6339021357388335, CurrSamplesPerSec=2.6275967077620392, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:32:30,265] [INFO] [logging.py:96:log_dist] [Rank 0] step=38450, skipped=0, lr=[6.519745296871153e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:32:30,267] [INFO] [timer.py:259:stop] epoch=0/micro_step=38450/global_step=38450, RunningAvgSamplesPerSec=2.6339012379236384, CurrSamplesPerSec=2.60375351834023, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:32:45,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=38460, skipped=0, lr=[6.518140062854768e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:32:45,526] [INFO] [timer.py:259:stop] epoch=0/micro_step=38460/global_step=38460, RunningAvgSamplesPerSec=2.6339012984997363, CurrSamplesPerSec=2.6302727046712953, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:33:00,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=38470, skipped=0, lr=[6.516534656453089e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:33:00,826] [INFO] [timer.py:259:stop] epoch=0/micro_step=38470/global_step=38470, RunningAvgSamplesPerSec=2.6338995071376177, CurrSamplesPerSec=2.6232388884024047, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:33:16,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=38480, skipped=0, lr=[6.51492907784841e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:33:16,065] [INFO] [timer.py:259:stop] epoch=0/micro_step=38480/global_step=38480, RunningAvgSamplesPerSec=2.6339008778915027, CurrSamplesPerSec=2.6334029760068267, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:33:31,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=38490, skipped=0, lr=[6.513323327223045e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:33:31,332] [INFO] [timer.py:259:stop] epoch=0/micro_step=38490/global_step=38490, RunningAvgSamplesPerSec=2.6339007026361285, CurrSamplesPerSec=2.660525111651867, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:33:46,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=38500, skipped=0, lr=[6.511717404759328e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:33:46,604] [INFO] [timer.py:259:stop] epoch=0/micro_step=38500/global_step=38500, RunningAvgSamplesPerSec=2.6339003691294853, CurrSamplesPerSec=2.635548339690659, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:34:01,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=38510, skipped=0, lr=[6.510111310639613e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:34:01,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=38510/global_step=38510, RunningAvgSamplesPerSec=2.633898256246077, CurrSamplesPerSec=2.622714805649437, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:34:17,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=38520, skipped=0, lr=[6.508505045046271e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:34:17,178] [INFO] [timer.py:259:stop] epoch=0/micro_step=38520/global_step=38520, RunningAvgSamplesPerSec=2.6338985792392724, CurrSamplesPerSec=2.6579817696103465, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:34:32,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=38530, skipped=0, lr=[6.506898608161694e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:34:32,501] [INFO] [timer.py:259:stop] epoch=0/micro_step=38530/global_step=38530, RunningAvgSamplesPerSec=2.633896774783881, CurrSamplesPerSec=2.633969796666978, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:34:47,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=38540, skipped=0, lr=[6.505292000168293e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:34:47,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=38540/global_step=38540, RunningAvgSamplesPerSec=2.6338976650595263, CurrSamplesPerSec=2.616108781398843, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:35:02,987] [INFO] [logging.py:96:log_dist] [Rank 0] step=38550, skipped=0, lr=[6.503685221248499e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:35:03,017] [INFO] [timer.py:259:stop] epoch=0/micro_step=38550/global_step=38550, RunningAvgSamplesPerSec=2.633896931565206, CurrSamplesPerSec=2.6132705917970296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:35:18,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=38560, skipped=0, lr=[6.502078271584764e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:35:18,289] [INFO] [timer.py:259:stop] epoch=0/micro_step=38560/global_step=38560, RunningAvgSamplesPerSec=2.6338962648664244, CurrSamplesPerSec=2.6356224514835405, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:35:33,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=38570, skipped=0, lr=[6.500471151359556e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:35:33,841] [INFO] [timer.py:259:stop] epoch=0/micro_step=38570/global_step=38570, RunningAvgSamplesPerSec=2.6338840785319917, CurrSamplesPerSec=2.57910789545055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:35:49,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=38580, skipped=0, lr=[6.498863860755366e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:35:49,186] [INFO] [timer.py:259:stop] epoch=0/micro_step=38580/global_step=38580, RunningAvgSamplesPerSec=2.6338807906586172, CurrSamplesPerSec=2.6114328285107704, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:36:04,495] [INFO] [logging.py:96:log_dist] [Rank 0] step=38590, skipped=0, lr=[6.4972563999547e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:36:04,497] [INFO] [timer.py:259:stop] epoch=0/micro_step=38590/global_step=38590, RunningAvgSamplesPerSec=2.633878876156482, CurrSamplesPerSec=2.6224335768553972, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:36:19,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=38600, skipped=0, lr=[6.495648769140087e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:36:19,764] [INFO] [timer.py:259:stop] epoch=0/micro_step=38600/global_step=38600, RunningAvgSamplesPerSec=2.633878787289685, CurrSamplesPerSec=2.643287100650288, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:36:35,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=38610, skipped=0, lr=[6.494040968494073e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:36:35,009] [INFO] [timer.py:259:stop] epoch=0/micro_step=38610/global_step=38610, RunningAvgSamplesPerSec=2.6338794933314396, CurrSamplesPerSec=2.640922084568281, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:36:50,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=38620, skipped=0, lr=[6.492432998199224e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:36:50,306] [INFO] [timer.py:259:stop] epoch=0/micro_step=38620/global_step=38620, RunningAvgSamplesPerSec=2.6338774247662684, CurrSamplesPerSec=2.588329924143352, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:37:05,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=38630, skipped=0, lr=[6.490824858438127e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:37:05,653] [INFO] [timer.py:259:stop] epoch=0/micro_step=38630/global_step=38630, RunningAvgSamplesPerSec=2.6338740868174426, CurrSamplesPerSec=2.6334128963712424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:37:20,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=38640, skipped=0, lr=[6.489216549393387e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:37:20,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=38640/global_step=38640, RunningAvgSamplesPerSec=2.6338724961775033, CurrSamplesPerSec=2.6541708222277336, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:37:36,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=38650, skipped=0, lr=[6.487608071247627e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:37:36,261] [INFO] [timer.py:259:stop] epoch=0/micro_step=38650/global_step=38650, RunningAvgSamplesPerSec=2.6338706805287453, CurrSamplesPerSec=2.6256142366704336, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:37:51,533] [INFO] [logging.py:96:log_dist] [Rank 0] step=38660, skipped=0, lr=[6.485999424183491e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:37:51,534] [INFO] [timer.py:259:stop] epoch=0/micro_step=38660/global_step=38660, RunningAvgSamplesPerSec=2.6338704418525625, CurrSamplesPerSec=2.644559151669924, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:38:06,759] [INFO] [logging.py:96:log_dist] [Rank 0] step=38670, skipped=0, lr=[6.48439060838364e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:38:06,760] [INFO] [timer.py:259:stop] epoch=0/micro_step=38670/global_step=38670, RunningAvgSamplesPerSec=2.6338720407429674, CurrSamplesPerSec=2.6423254380500367, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:38:22,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=38680, skipped=0, lr=[6.482781624030756e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:38:22,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=38680/global_step=38680, RunningAvgSamplesPerSec=2.633869990848896, CurrSamplesPerSec=2.6350656799387644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:38:37,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=38690, skipped=0, lr=[6.48117247130754e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:38:37,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=38690/global_step=38690, RunningAvgSamplesPerSec=2.6338697020892923, CurrSamplesPerSec=2.631817505499393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:38:52,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=38700, skipped=0, lr=[6.479563150396714e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:38:52,608] [INFO] [timer.py:259:stop] epoch=0/micro_step=38700/global_step=38700, RunningAvgSamplesPerSec=2.633869557877927, CurrSamplesPerSec=2.6294989235097637, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:39:07,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=38710, skipped=0, lr=[6.477953661481011e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:39:07,868] [INFO] [timer.py:259:stop] epoch=0/micro_step=38710/global_step=38710, RunningAvgSamplesPerSec=2.633870162800173, CurrSamplesPerSec=2.6321953169197956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:39:23,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=38720, skipped=0, lr=[6.476344004743197e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:39:23,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=38720/global_step=38720, RunningAvgSamplesPerSec=2.633868930120392, CurrSamplesPerSec=2.6338114261457344, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:39:38,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=38730, skipped=0, lr=[6.474734180366043e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:39:38,423] [INFO] [timer.py:259:stop] epoch=0/micro_step=38730/global_step=38730, RunningAvgSamplesPerSec=2.633869566911952, CurrSamplesPerSec=2.634926627262356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:39:53,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=38740, skipped=0, lr=[6.473124188532347e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:39:53,661] [INFO] [timer.py:259:stop] epoch=0/micro_step=38740/global_step=38740, RunningAvgSamplesPerSec=2.633870360882218, CurrSamplesPerSec=2.6400257007823273, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:40:08,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=38750, skipped=0, lr=[6.471514029424923e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:40:08,885] [INFO] [timer.py:259:stop] epoch=0/micro_step=38750/global_step=38750, RunningAvgSamplesPerSec=2.6338724178263395, CurrSamplesPerSec=2.6423441650542165, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:40:24,132] [INFO] [logging.py:96:log_dist] [Rank 0] step=38760, skipped=0, lr=[6.469903703226607e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:40:24,133] [INFO] [timer.py:259:stop] epoch=0/micro_step=38760/global_step=38760, RunningAvgSamplesPerSec=2.6338723661782564, CurrSamplesPerSec=2.6379796400698576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:40:39,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=38770, skipped=0, lr=[6.468293210120251e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:40:39,441] [INFO] [timer.py:259:stop] epoch=0/micro_step=38770/global_step=38770, RunningAvgSamplesPerSec=2.6338706660628635, CurrSamplesPerSec=2.609765298813614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:40:54,660] [INFO] [logging.py:96:log_dist] [Rank 0] step=38780, skipped=0, lr=[6.466682550288726e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:40:54,673] [INFO] [timer.py:259:stop] epoch=0/micro_step=38780/global_step=38780, RunningAvgSamplesPerSec=2.633871984889312, CurrSamplesPerSec=2.6473216249512532, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:41:09,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=38790, skipped=0, lr=[6.465071723914926e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:41:09,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=38790/global_step=38790, RunningAvgSamplesPerSec=2.6338722144853115, CurrSamplesPerSec=2.6314757105996973, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:41:25,176] [INFO] [logging.py:96:log_dist] [Rank 0] step=38800, skipped=0, lr=[6.463460731181759e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:41:25,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=38800/global_step=38800, RunningAvgSamplesPerSec=2.6338735964785576, CurrSamplesPerSec=2.6396373321625393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:41:40,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=38810, skipped=0, lr=[6.461849572272152e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:41:40,440] [INFO] [timer.py:259:stop] epoch=0/micro_step=38810/global_step=38810, RunningAvgSamplesPerSec=2.6338735847621666, CurrSamplesPerSec=2.5969225377858542, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:41:55,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=38820, skipped=0, lr=[6.460238247369055e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:41:55,710] [INFO] [timer.py:259:stop] epoch=0/micro_step=38820/global_step=38820, RunningAvgSamplesPerSec=2.6338728316951494, CurrSamplesPerSec=2.6381248227896643, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:42:10,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=38830, skipped=0, lr=[6.458626756655435e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:42:10,952] [INFO] [timer.py:259:stop] epoch=0/micro_step=38830/global_step=38830, RunningAvgSamplesPerSec=2.6338734886547233, CurrSamplesPerSec=2.645004012475698, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:42:26,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=38840, skipped=0, lr=[6.457015100314274e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:42:26,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=38840/global_step=38840, RunningAvgSamplesPerSec=2.633875761950965, CurrSamplesPerSec=2.647499171229993, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:42:41,475] [INFO] [logging.py:96:log_dist] [Rank 0] step=38850, skipped=0, lr=[6.45540327852858e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:42:41,476] [INFO] [timer.py:259:stop] epoch=0/micro_step=38850/global_step=38850, RunningAvgSamplesPerSec=2.633875411095185, CurrSamplesPerSec=2.6280786933733697, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:42:56,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=38860, skipped=0, lr=[6.453791291481373e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:42:56,763] [INFO] [timer.py:259:stop] epoch=0/micro_step=38860/global_step=38860, RunningAvgSamplesPerSec=2.6338749737582736, CurrSamplesPerSec=2.638494073699496, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:43:12,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=38870, skipped=0, lr=[6.452179139355697e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:43:12,031] [INFO] [timer.py:259:stop] epoch=0/micro_step=38870/global_step=38870, RunningAvgSamplesPerSec=2.6338753997756355, CurrSamplesPerSec=2.6412226776682144, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:43:27,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=38880, skipped=0, lr=[6.45056682233461e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:43:27,391] [INFO] [timer.py:259:stop] epoch=0/micro_step=38880/global_step=38880, RunningAvgSamplesPerSec=2.6338721300928656, CurrSamplesPerSec=2.622899727998629, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:43:42,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=38890, skipped=0, lr=[6.448954340601191e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:43:42,674] [INFO] [timer.py:259:stop] epoch=0/micro_step=38890/global_step=38890, RunningAvgSamplesPerSec=2.63387176688016, CurrSamplesPerSec=2.62152225259383, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:43:57,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=38900, skipped=0, lr=[6.44734169433854e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:43:57,978] [INFO] [timer.py:259:stop] epoch=0/micro_step=38900/global_step=38900, RunningAvgSamplesPerSec=2.63387058430766, CurrSamplesPerSec=2.6230494070863495, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:44:13,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=38910, skipped=0, lr=[6.445728883729773e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:44:13,262] [INFO] [timer.py:259:stop] epoch=0/micro_step=38910/global_step=38910, RunningAvgSamplesPerSec=2.6338705736506456, CurrSamplesPerSec=2.635515632459668, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:44:28,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=38920, skipped=0, lr=[6.444115908958024e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:44:28,551] [INFO] [timer.py:259:stop] epoch=0/micro_step=38920/global_step=38920, RunningAvgSamplesPerSec=2.6338691722740637, CurrSamplesPerSec=2.614611291286985, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:44:43,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=38930, skipped=0, lr=[6.442502770206449e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:44:43,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=38930/global_step=38930, RunningAvgSamplesPerSec=2.633867430844269, CurrSamplesPerSec=2.6527766729030784, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:44:59,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=38940, skipped=0, lr=[6.440889467658218e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:44:59,102] [INFO] [timer.py:259:stop] epoch=0/micro_step=38940/global_step=38940, RunningAvgSamplesPerSec=2.6338685661763135, CurrSamplesPerSec=2.62258935211767, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:45:14,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=38950, skipped=0, lr=[6.439276001496524e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:45:14,363] [INFO] [timer.py:259:stop] epoch=0/micro_step=38950/global_step=38950, RunningAvgSamplesPerSec=2.633868891690481, CurrSamplesPerSec=2.624453120560647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:45:29,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=38960, skipped=0, lr=[6.437662371904573e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:45:29,672] [INFO] [timer.py:259:stop] epoch=0/micro_step=38960/global_step=38960, RunningAvgSamplesPerSec=2.633866857777803, CurrSamplesPerSec=2.578315579348686, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:45:44,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=38970, skipped=0, lr=[6.436048579065597e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:45:44,926] [INFO] [timer.py:259:stop] epoch=0/micro_step=38970/global_step=38970, RunningAvgSamplesPerSec=2.633867861059522, CurrSamplesPerSec=2.638252596802745, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:46:00,282] [INFO] [logging.py:96:log_dist] [Rank 0] step=38980, skipped=0, lr=[6.434434623162842e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:46:00,283] [INFO] [timer.py:259:stop] epoch=0/micro_step=38980/global_step=38980, RunningAvgSamplesPerSec=2.6338637611240188, CurrSamplesPerSec=2.597062834333343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:46:15,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=38990, skipped=0, lr=[6.432820504379572e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:46:15,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=38990/global_step=38990, RunningAvgSamplesPerSec=2.6338672819506694, CurrSamplesPerSec=2.6540104330860315, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:46:30,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=39000, skipped=0, lr=[6.4312062228990715e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:46:30,768] [INFO] [timer.py:259:stop] epoch=0/micro_step=39000/global_step=39000, RunningAvgSamplesPerSec=2.6338656988515283, CurrSamplesPerSec=2.6440077653841176, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:46:46,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=39010, skipped=0, lr=[6.429591778904643e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:46:46,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=39010/global_step=39010, RunningAvgSamplesPerSec=2.6338657386697797, CurrSamplesPerSec=2.6457773506760196, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:47:01,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=39020, skipped=0, lr=[6.427977172579606e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:47:01,319] [INFO] [timer.py:259:stop] epoch=0/micro_step=39020/global_step=39020, RunningAvgSamplesPerSec=2.633865081546094, CurrSamplesPerSec=2.637987935794804, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:47:16,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=39030, skipped=0, lr=[6.4263624041073e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:47:16,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=39030/global_step=39030, RunningAvgSamplesPerSec=2.6338648526602406, CurrSamplesPerSec=2.638365446483015, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:47:31,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=39040, skipped=0, lr=[6.424747473671083e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:47:31,887] [INFO] [timer.py:259:stop] epoch=0/micro_step=39040/global_step=39040, RunningAvgSamplesPerSec=2.6338635024497785, CurrSamplesPerSec=2.6365063176989816, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:47:47,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=39050, skipped=0, lr=[6.4231323814543315e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:47:47,181] [INFO] [timer.py:259:stop] epoch=0/micro_step=39050/global_step=39050, RunningAvgSamplesPerSec=2.6338624664392185, CurrSamplesPerSec=2.634558787828751, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:48:02,475] [INFO] [logging.py:96:log_dist] [Rank 0] step=39060, skipped=0, lr=[6.421517127640438e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:48:02,477] [INFO] [timer.py:259:stop] epoch=0/micro_step=39060/global_step=39060, RunningAvgSamplesPerSec=2.633861285087405, CurrSamplesPerSec=2.644804286067127, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:48:17,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=39070, skipped=0, lr=[6.419901712412816e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:48:17,771] [INFO] [timer.py:259:stop] epoch=0/micro_step=39070/global_step=39070, RunningAvgSamplesPerSec=2.633860156128275, CurrSamplesPerSec=2.6412871290799895, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:48:33,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=39080, skipped=0, lr=[6.418286135954896e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:48:33,067] [INFO] [timer.py:259:stop] epoch=0/micro_step=39080/global_step=39080, RunningAvgSamplesPerSec=2.6338586901987195, CurrSamplesPerSec=2.6430160158146356, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:48:48,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=39090, skipped=0, lr=[6.416670398450128e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:48:48,333] [INFO] [timer.py:259:stop] epoch=0/micro_step=39090/global_step=39090, RunningAvgSamplesPerSec=2.633858252324089, CurrSamplesPerSec=2.6305304579713065, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:49:03,631] [INFO] [logging.py:96:log_dist] [Rank 0] step=39100, skipped=0, lr=[6.415054500081978e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:49:03,632] [INFO] [timer.py:259:stop] epoch=0/micro_step=39100/global_step=39100, RunningAvgSamplesPerSec=2.633856691923209, CurrSamplesPerSec=2.632434860117685, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:49:18,907] [INFO] [logging.py:96:log_dist] [Rank 0] step=39110, skipped=0, lr=[6.413438441033933e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:49:18,909] [INFO] [timer.py:259:stop] epoch=0/micro_step=39110/global_step=39110, RunningAvgSamplesPerSec=2.6338567172159486, CurrSamplesPerSec=2.611749107261961, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:49:34,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=39120, skipped=0, lr=[6.411822221489499e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:49:34,097] [INFO] [timer.py:259:stop] epoch=0/micro_step=39120/global_step=39120, RunningAvgSamplesPerSec=2.633859865328689, CurrSamplesPerSec=2.658981834515286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:49:49,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=39130, skipped=0, lr=[6.4102058416321944e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:49:49,323] [INFO] [timer.py:259:stop] epoch=0/micro_step=39130/global_step=39130, RunningAvgSamplesPerSec=2.633861553281657, CurrSamplesPerSec=2.613643503754443, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:50:04,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=39140, skipped=0, lr=[6.408589301645563e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:50:04,581] [INFO] [timer.py:259:stop] epoch=0/micro_step=39140/global_step=39140, RunningAvgSamplesPerSec=2.633862271620437, CurrSamplesPerSec=2.6478472310272974, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:50:19,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=39150, skipped=0, lr=[6.406972601713159e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:50:19,803] [INFO] [timer.py:259:stop] epoch=0/micro_step=39150/global_step=39150, RunningAvgSamplesPerSec=2.6338642599057707, CurrSamplesPerSec=2.632972752119222, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:50:35,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=39160, skipped=0, lr=[6.405355742018564e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:50:35,030] [INFO] [timer.py:259:stop] epoch=0/micro_step=39160/global_step=39160, RunningAvgSamplesPerSec=2.63386614862594, CurrSamplesPerSec=2.6461312181346726, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:50:50,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=39170, skipped=0, lr=[6.403738722745371e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:50:50,316] [INFO] [timer.py:259:stop] epoch=0/micro_step=39170/global_step=39170, RunningAvgSamplesPerSec=2.6338662832632123, CurrSamplesPerSec=2.6279753664541188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:51:05,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=39180, skipped=0, lr=[6.402121544077192e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:51:05,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=39180/global_step=39180, RunningAvgSamplesPerSec=2.6338664953523714, CurrSamplesPerSec=2.6230120882357393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:51:20,864] [INFO] [logging.py:96:log_dist] [Rank 0] step=39190, skipped=0, lr=[6.4005042061976596e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:51:20,865] [INFO] [timer.py:259:stop] epoch=0/micro_step=39190/global_step=39190, RunningAvgSamplesPerSec=2.6338653336541338, CurrSamplesPerSec=2.631294116580505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:51:36,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=39200, skipped=0, lr=[6.398886709290424e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:51:36,102] [INFO] [timer.py:259:stop] epoch=0/micro_step=39200/global_step=39200, RunningAvgSamplesPerSec=2.6338673993673916, CurrSamplesPerSec=2.64948011276142, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:51:51,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=39210, skipped=0, lr=[6.39726905353915e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:51:51,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=39210/global_step=39210, RunningAvgSamplesPerSec=2.6338684432055093, CurrSamplesPerSec=2.6371308484851945, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:52:06,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=39220, skipped=0, lr=[6.395651239127524e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:52:06,586] [INFO] [timer.py:259:stop] epoch=0/micro_step=39220/global_step=39220, RunningAvgSamplesPerSec=2.633869555751033, CurrSamplesPerSec=2.631699435871134, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:52:21,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=39230, skipped=0, lr=[6.394033266239249e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:52:21,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=39230/global_step=39230, RunningAvgSamplesPerSec=2.633871048558241, CurrSamplesPerSec=2.6460940742974524, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:52:37,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=39240, skipped=0, lr=[6.3924151350580475e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:52:37,094] [INFO] [timer.py:259:stop] epoch=0/micro_step=39240/global_step=39240, RunningAvgSamplesPerSec=2.6338723124772443, CurrSamplesPerSec=2.6366248190466393, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:52:52,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=39250, skipped=0, lr=[6.390796845767658e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:52:52,379] [INFO] [timer.py:259:stop] epoch=0/micro_step=39250/global_step=39250, RunningAvgSamplesPerSec=2.633872007913204, CurrSamplesPerSec=2.634496319118222, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:53:07,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=39260, skipped=0, lr=[6.389178398551836e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:53:07,632] [INFO] [timer.py:259:stop] epoch=0/micro_step=39260/global_step=39260, RunningAvgSamplesPerSec=2.633872593499755, CurrSamplesPerSec=2.6353645273115154, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:53:22,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=39270, skipped=0, lr=[6.387559793594361e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:53:22,899] [INFO] [timer.py:259:stop] epoch=0/micro_step=39270/global_step=39270, RunningAvgSamplesPerSec=2.6338725767425313, CurrSamplesPerSec=2.6386588178384756, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:53:38,160] [INFO] [logging.py:96:log_dist] [Rank 0] step=39280, skipped=0, lr=[6.3859410310790235e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:53:38,168] [INFO] [timer.py:259:stop] epoch=0/micro_step=39280/global_step=39280, RunningAvgSamplesPerSec=2.6338719058226783, CurrSamplesPerSec=2.6462860649151225, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:53:53,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=39290, skipped=0, lr=[6.384322111189634e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:53:53,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=39290/global_step=39290, RunningAvgSamplesPerSec=2.6338723108893864, CurrSamplesPerSec=2.6168257208251227, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:54:08,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=39300, skipped=0, lr=[6.382703034110021e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:54:08,758] [INFO] [timer.py:259:stop] epoch=0/micro_step=39300/global_step=39300, RunningAvgSamplesPerSec=2.633869553096978, CurrSamplesPerSec=2.594566696668214, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:54:24,174] [INFO] [logging.py:96:log_dist] [Rank 0] step=39310, skipped=0, lr=[6.381083800024032e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:54:24,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=39310/global_step=39310, RunningAvgSamplesPerSec=2.633862742708409, CurrSamplesPerSec=2.5880556214447576, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:54:39,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=39320, skipped=0, lr=[6.379464409115534e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:54:39,555] [INFO] [timer.py:259:stop] epoch=0/micro_step=39320/global_step=39320, RunningAvgSamplesPerSec=2.6338587698185036, CurrSamplesPerSec=2.602877742376661, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:54:54,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=39330, skipped=0, lr=[6.377844861568407e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:54:54,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=39330/global_step=39330, RunningAvgSamplesPerSec=2.6338606778580798, CurrSamplesPerSec=2.638392830581519, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:55:10,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=39340, skipped=0, lr=[6.37622515756655e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:55:10,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=39340/global_step=39340, RunningAvgSamplesPerSec=2.6338608100451824, CurrSamplesPerSec=2.634421443309222, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:55:25,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=39350, skipped=0, lr=[6.374605297293883e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:55:25,275] [INFO] [timer.py:259:stop] epoch=0/micro_step=39350/global_step=39350, RunningAvgSamplesPerSec=2.6338637153554694, CurrSamplesPerSec=2.6337890986745736, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:55:40,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=39360, skipped=0, lr=[6.372985280934341e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:55:40,592] [INFO] [timer.py:259:stop] epoch=0/micro_step=39360/global_step=39360, RunningAvgSamplesPerSec=2.633861995710777, CurrSamplesPerSec=2.6447771856428157, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:55:55,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=39370, skipped=0, lr=[6.3713651086718764e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:55:55,805] [INFO] [timer.py:259:stop] epoch=0/micro_step=39370/global_step=39370, RunningAvgSamplesPerSec=2.6338642205686376, CurrSamplesPerSec=2.6255086382073296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:56:11,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=39380, skipped=0, lr=[6.369744780690461e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:56:11,081] [INFO] [timer.py:259:stop] epoch=0/micro_step=39380/global_step=39380, RunningAvgSamplesPerSec=2.6338644624192633, CurrSamplesPerSec=2.6264050569473434, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:56:26,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=39390, skipped=0, lr=[6.368124297174085e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:56:26,329] [INFO] [timer.py:259:stop] epoch=0/micro_step=39390/global_step=39390, RunningAvgSamplesPerSec=2.633865702898255, CurrSamplesPerSec=2.627211164262614, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:56:41,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=39400, skipped=0, lr=[6.366503658306753e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:56:41,609] [INFO] [timer.py:259:stop] epoch=0/micro_step=39400/global_step=39400, RunningAvgSamplesPerSec=2.633865809805842, CurrSamplesPerSec=2.651059293700726, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:56:56,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=39410, skipped=0, lr=[6.364882864272489e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:56:56,864] [INFO] [timer.py:259:stop] epoch=0/micro_step=39410/global_step=39410, RunningAvgSamplesPerSec=2.6338662733865763, CurrSamplesPerSec=2.6358472965582074, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:57:12,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=39420, skipped=0, lr=[6.363261915255337e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:57:12,119] [INFO] [timer.py:259:stop] epoch=0/micro_step=39420/global_step=39420, RunningAvgSamplesPerSec=2.633866883601956, CurrSamplesPerSec=2.653631370367622, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:57:27,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=39430, skipped=0, lr=[6.361640811439353e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:57:27,312] [INFO] [timer.py:259:stop] epoch=0/micro_step=39430/global_step=39430, RunningAvgSamplesPerSec=2.6338702327171872, CurrSamplesPerSec=2.6375632617186135, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:57:42,515] [INFO] [logging.py:96:log_dist] [Rank 0] step=39440, skipped=0, lr=[6.360019553008616e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:57:42,534] [INFO] [timer.py:259:stop] epoch=0/micro_step=39440/global_step=39440, RunningAvgSamplesPerSec=2.6338720266916793, CurrSamplesPerSec=2.6563981185924295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:57:57,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=39450, skipped=0, lr=[6.358398140147218e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:57:57,742] [INFO] [timer.py:259:stop] epoch=0/micro_step=39450/global_step=39450, RunningAvgSamplesPerSec=2.6338744672022543, CurrSamplesPerSec=2.652747311641167, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:58:12,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=39460, skipped=0, lr=[6.356776573039275e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:58:12,971] [INFO] [timer.py:259:stop] epoch=0/micro_step=39460/global_step=39460, RunningAvgSamplesPerSec=2.633875808680899, CurrSamplesPerSec=2.658710470418192, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:58:28,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=39470, skipped=0, lr=[6.355154851868915e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:58:28,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=39470/global_step=39470, RunningAvgSamplesPerSec=2.6338786923281297, CurrSamplesPerSec=2.6476963799513085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:58:43,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=39480, skipped=0, lr=[6.353532976820282e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:58:43,431] [INFO] [timer.py:259:stop] epoch=0/micro_step=39480/global_step=39480, RunningAvgSamplesPerSec=2.633879173876142, CurrSamplesPerSec=2.61463573964407, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:58:58,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=39490, skipped=0, lr=[6.351910948077545e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:58:58,607] [INFO] [timer.py:259:stop] epoch=0/micro_step=39490/global_step=39490, RunningAvgSamplesPerSec=2.6338825894724667, CurrSamplesPerSec=2.6298867877831484, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:59:13,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=39500, skipped=0, lr=[6.350288765824881e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:59:13,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=39500/global_step=39500, RunningAvgSamplesPerSec=2.63388283072531, CurrSamplesPerSec=2.638025682002103, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:59:29,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=39510, skipped=0, lr=[6.348666430246493e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:59:29,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=39510/global_step=39510, RunningAvgSamplesPerSec=2.6338841989276665, CurrSamplesPerSec=2.628719007418571, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:59:44,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=39520, skipped=0, lr=[6.3470439415265974e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:59:44,384] [INFO] [timer.py:259:stop] epoch=0/micro_step=39520/global_step=39520, RunningAvgSamplesPerSec=2.63388353058998, CurrSamplesPerSec=2.6430580698792525, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 16:59:59,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=39530, skipped=0, lr=[6.3454212998494255e-06], mom=[(0.9, 0.95)] +[2024-11-01 16:59:59,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=39530/global_step=39530, RunningAvgSamplesPerSec=2.633883407445784, CurrSamplesPerSec=2.6672189820738357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:00:14,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=39540, skipped=0, lr=[6.343798505399234e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:00:14,931] [INFO] [timer.py:259:stop] epoch=0/micro_step=39540/global_step=39540, RunningAvgSamplesPerSec=2.6338827357667514, CurrSamplesPerSec=2.6261008389324276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:00:30,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=39550, skipped=0, lr=[6.342175558360286e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:00:30,212] [INFO] [timer.py:259:stop] epoch=0/micro_step=39550/global_step=39550, RunningAvgSamplesPerSec=2.6338829975469746, CurrSamplesPerSec=2.62707828701541, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:00:45,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=39560, skipped=0, lr=[6.340552458916871e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:00:45,437] [INFO] [timer.py:259:stop] epoch=0/micro_step=39560/global_step=39560, RunningAvgSamplesPerSec=2.6338845158908097, CurrSamplesPerSec=2.6514988013810274, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:01:00,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=39570, skipped=0, lr=[6.338929207253292e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:01:00,721] [INFO] [timer.py:259:stop] epoch=0/micro_step=39570/global_step=39570, RunningAvgSamplesPerSec=2.63388315039352, CurrSamplesPerSec=2.6341749209999903, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:01:15,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=39580, skipped=0, lr=[6.337305803553869e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:01:15,922] [INFO] [timer.py:259:stop] epoch=0/micro_step=39580/global_step=39580, RunningAvgSamplesPerSec=2.6338856107304216, CurrSamplesPerSec=2.6469394590684283, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:01:31,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=39590, skipped=0, lr=[6.335682248002939e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:01:31,175] [INFO] [timer.py:259:stop] epoch=0/micro_step=39590/global_step=39590, RunningAvgSamplesPerSec=2.633886362949653, CurrSamplesPerSec=2.6539659306342234, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:01:46,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=39600, skipped=0, lr=[6.334058540784861e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:01:46,563] [INFO] [timer.py:259:stop] epoch=0/micro_step=39600/global_step=39600, RunningAvgSamplesPerSec=2.6338810647782496, CurrSamplesPerSec=2.547016111883359, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:02:02,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=39610, skipped=0, lr=[6.332434682084005e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:02:02,007] [INFO] [timer.py:259:stop] epoch=0/micro_step=39610/global_step=39610, RunningAvgSamplesPerSec=2.633876135089494, CurrSamplesPerSec=2.6352560734763486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:02:17,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=39620, skipped=0, lr=[6.33081067208476e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:02:17,306] [INFO] [timer.py:259:stop] epoch=0/micro_step=39620/global_step=39620, RunningAvgSamplesPerSec=2.6338749668381722, CurrSamplesPerSec=2.619454046874089, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:02:32,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=39630, skipped=0, lr=[6.329186510971535e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:02:32,895] [INFO] [timer.py:259:stop] epoch=0/micro_step=39630/global_step=39630, RunningAvgSamplesPerSec=2.63386235200186, CurrSamplesPerSec=2.613576730036677, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:02:48,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=39640, skipped=0, lr=[6.327562198928749e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:02:48,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=39640/global_step=39640, RunningAvgSamplesPerSec=2.6338558455802232, CurrSamplesPerSec=2.6270371512735844, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:03:03,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=39650, skipped=0, lr=[6.32593773614085e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:03:03,629] [INFO] [timer.py:259:stop] epoch=0/micro_step=39650/global_step=39650, RunningAvgSamplesPerSec=2.633853975219682, CurrSamplesPerSec=2.6304162154758752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:03:18,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=39660, skipped=0, lr=[6.324313122792294e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:03:18,920] [INFO] [timer.py:259:stop] epoch=0/micro_step=39660/global_step=39660, RunningAvgSamplesPerSec=2.633852628498641, CurrSamplesPerSec=2.6120524490760255, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:03:34,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=39670, skipped=0, lr=[6.322688359067554e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:03:34,186] [INFO] [timer.py:259:stop] epoch=0/micro_step=39670/global_step=39670, RunningAvgSamplesPerSec=2.6338521748216186, CurrSamplesPerSec=2.636169931358901, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:03:49,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=39680, skipped=0, lr=[6.321063445151124e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:03:49,559] [INFO] [timer.py:259:stop] epoch=0/micro_step=39680/global_step=39680, RunningAvgSamplesPerSec=2.6338479977162934, CurrSamplesPerSec=2.601791516699435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:04:04,877] [INFO] [logging.py:96:log_dist] [Rank 0] step=39690, skipped=0, lr=[6.319438381227514e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:04:04,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=39690/global_step=39690, RunningAvgSamplesPerSec=2.6338457906826145, CurrSamplesPerSec=2.625337726400412, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:04:20,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=39700, skipped=0, lr=[6.317813167481249e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:04:20,237] [INFO] [timer.py:259:stop] epoch=0/micro_step=39700/global_step=39700, RunningAvgSamplesPerSec=2.6338410952308955, CurrSamplesPerSec=2.5849927356621296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:04:35,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=39710, skipped=0, lr=[6.316187804096875e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:04:35,493] [INFO] [timer.py:259:stop] epoch=0/micro_step=39710/global_step=39710, RunningAvgSamplesPerSec=2.6338411348361572, CurrSamplesPerSec=2.6420974060517706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:04:50,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=39720, skipped=0, lr=[6.31456229125895e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:04:50,776] [INFO] [timer.py:259:stop] epoch=0/micro_step=39720/global_step=39720, RunningAvgSamplesPerSec=2.6338401372750844, CurrSamplesPerSec=2.6312128201080647, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:05:06,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=39730, skipped=0, lr=[6.312936629152053e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:05:06,055] [INFO] [timer.py:259:stop] epoch=0/micro_step=39730/global_step=39730, RunningAvgSamplesPerSec=2.6338390937250034, CurrSamplesPerSec=2.6266780909532463, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:05:21,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=39740, skipped=0, lr=[6.311310817960778e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:05:21,345] [INFO] [timer.py:259:stop] epoch=0/micro_step=39740/global_step=39740, RunningAvgSamplesPerSec=2.6338378590572096, CurrSamplesPerSec=2.6345265188534106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:05:36,573] [INFO] [logging.py:96:log_dist] [Rank 0] step=39750, skipped=0, lr=[6.309684857869738e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:05:36,585] [INFO] [timer.py:259:stop] epoch=0/micro_step=39750/global_step=39750, RunningAvgSamplesPerSec=2.6338386729552443, CurrSamplesPerSec=2.6265485572017435, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:05:51,946] [INFO] [logging.py:96:log_dist] [Rank 0] step=39760, skipped=0, lr=[6.308058749063557e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:05:51,948] [INFO] [timer.py:259:stop] epoch=0/micro_step=39760/global_step=39760, RunningAvgSamplesPerSec=2.6338342271230366, CurrSamplesPerSec=2.6080552995205735, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:06:07,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=39770, skipped=0, lr=[6.306432491726885e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:06:07,185] [INFO] [timer.py:259:stop] epoch=0/micro_step=39770/global_step=39770, RunningAvgSamplesPerSec=2.6338351957157897, CurrSamplesPerSec=2.6376648558651357, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:06:22,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=39780, skipped=0, lr=[6.304806086044381e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:06:22,480] [INFO] [timer.py:259:stop] epoch=0/micro_step=39780/global_step=39780, RunningAvgSamplesPerSec=2.6338340352151937, CurrSamplesPerSec=2.6432271323946264, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:06:37,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=39790, skipped=0, lr=[6.303179532200724e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:06:37,756] [INFO] [timer.py:259:stop] epoch=0/micro_step=39790/global_step=39790, RunningAvgSamplesPerSec=2.633833005221332, CurrSamplesPerSec=2.636829528284922, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:06:53,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=39800, skipped=0, lr=[6.301552830380611e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:06:53,082] [INFO] [timer.py:259:stop] epoch=0/micro_step=39800/global_step=39800, RunningAvgSamplesPerSec=2.6338300256262612, CurrSamplesPerSec=2.629883489836293, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:07:08,340] [INFO] [logging.py:96:log_dist] [Rank 0] step=39810, skipped=0, lr=[6.2999259807687555e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:07:08,367] [INFO] [timer.py:259:stop] epoch=0/micro_step=39810/global_step=39810, RunningAvgSamplesPerSec=2.6338291655834296, CurrSamplesPerSec=2.606550813887912, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:07:23,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=39820, skipped=0, lr=[6.298298983549885e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:07:23,663] [INFO] [timer.py:259:stop] epoch=0/micro_step=39820/global_step=39820, RunningAvgSamplesPerSec=2.633827871288064, CurrSamplesPerSec=2.6349738042040385, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:07:38,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=39830, skipped=0, lr=[6.296671838908746e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:07:38,897] [INFO] [timer.py:259:stop] epoch=0/micro_step=39830/global_step=39830, RunningAvgSamplesPerSec=2.633829131768561, CurrSamplesPerSec=2.637001939668644, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:07:54,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=39840, skipped=0, lr=[6.295044547030099e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:07:54,230] [INFO] [timer.py:259:stop] epoch=0/micro_step=39840/global_step=39840, RunningAvgSamplesPerSec=2.633826549324744, CurrSamplesPerSec=2.614443017499926, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:08:09,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=39850, skipped=0, lr=[6.293417108098727e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:08:09,524] [INFO] [timer.py:259:stop] epoch=0/micro_step=39850/global_step=39850, RunningAvgSamplesPerSec=2.633824783083262, CurrSamplesPerSec=2.5845889332894436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:08:24,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=39860, skipped=0, lr=[6.2917895222994265e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:08:24,774] [INFO] [timer.py:259:stop] epoch=0/micro_step=39860/global_step=39860, RunningAvgSamplesPerSec=2.6338250888089045, CurrSamplesPerSec=2.6436832093403653, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:08:40,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=39870, skipped=0, lr=[6.290161789817008e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:08:40,058] [INFO] [timer.py:259:stop] epoch=0/micro_step=39870/global_step=39870, RunningAvgSamplesPerSec=2.6338241441319368, CurrSamplesPerSec=2.60154501202901, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:08:55,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=39880, skipped=0, lr=[6.288533910836301e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:08:55,328] [INFO] [timer.py:259:stop] epoch=0/micro_step=39880/global_step=39880, RunningAvgSamplesPerSec=2.6338236164256474, CurrSamplesPerSec=2.6393674106041765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:09:10,580] [INFO] [logging.py:96:log_dist] [Rank 0] step=39890, skipped=0, lr=[6.2869058855421536e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:09:10,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=39890/global_step=39890, RunningAvgSamplesPerSec=2.6338224660294047, CurrSamplesPerSec=2.651951449649803, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:09:25,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=39900, skipped=0, lr=[6.285277714119427e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:09:25,873] [INFO] [timer.py:259:stop] epoch=0/micro_step=39900/global_step=39900, RunningAvgSamplesPerSec=2.6338220550270575, CurrSamplesPerSec=2.635981476535754, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:09:41,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=39910, skipped=0, lr=[6.283649396753001e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:09:41,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=39910/global_step=39910, RunningAvgSamplesPerSec=2.6338211767727797, CurrSamplesPerSec=2.6332583125181483, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:09:56,459] [INFO] [logging.py:96:log_dist] [Rank 0] step=39920, skipped=0, lr=[6.282020933627772e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:09:56,460] [INFO] [timer.py:259:stop] epoch=0/micro_step=39920/global_step=39920, RunningAvgSamplesPerSec=2.6338192243187066, CurrSamplesPerSec=2.64112912476401, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:10:11,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=39930, skipped=0, lr=[6.280392324928653e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:10:11,783] [INFO] [timer.py:259:stop] epoch=0/micro_step=39930/global_step=39930, RunningAvgSamplesPerSec=2.6338164002754776, CurrSamplesPerSec=2.6205272399380086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:10:27,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=39940, skipped=0, lr=[6.2787635708405715e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:10:27,096] [INFO] [timer.py:259:stop] epoch=0/micro_step=39940/global_step=39940, RunningAvgSamplesPerSec=2.6338142109732954, CurrSamplesPerSec=2.6290510228932873, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:10:42,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=39950, skipped=0, lr=[6.277134671548475e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:10:42,420] [INFO] [timer.py:259:stop] epoch=0/micro_step=39950/global_step=39950, RunningAvgSamplesPerSec=2.633811068438777, CurrSamplesPerSec=2.6399779273554707, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:10:57,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=39960, skipped=0, lr=[6.275505627237323e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:10:57,735] [INFO] [timer.py:259:stop] epoch=0/micro_step=39960/global_step=39960, RunningAvgSamplesPerSec=2.6338090347219496, CurrSamplesPerSec=2.632884327739599, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:11:13,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=39970, skipped=0, lr=[6.273876438092096e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:11:13,065] [INFO] [timer.py:259:stop] epoch=0/micro_step=39970/global_step=39970, RunningAvgSamplesPerSec=2.633806161723041, CurrSamplesPerSec=2.638520215602717, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:11:28,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=39980, skipped=0, lr=[6.2722471042977885e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:11:28,343] [INFO] [timer.py:259:stop] epoch=0/micro_step=39980/global_step=39980, RunningAvgSamplesPerSec=2.6338051004381504, CurrSamplesPerSec=2.6224573518829333, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:11:43,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=39990, skipped=0, lr=[6.270617626039411e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:11:43,667] [INFO] [timer.py:259:stop] epoch=0/micro_step=39990/global_step=39990, RunningAvgSamplesPerSec=2.6338026745882, CurrSamplesPerSec=2.620159317585493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:11:58,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=40000, skipped=0, lr=[6.268988003501993e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:11:58,936] [INFO] [timer.py:259:stop] epoch=0/micro_step=40000/global_step=40000, RunningAvgSamplesPerSec=2.6338022612047447, CurrSamplesPerSec=2.6189686780120933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:12:14,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=40010, skipped=0, lr=[6.267358236870576e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:12:14,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=40010/global_step=40010, RunningAvgSamplesPerSec=2.6337981781058164, CurrSamplesPerSec=2.632894244196764, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:12:29,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=40020, skipped=0, lr=[6.265728326330224e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:12:29,670] [INFO] [timer.py:259:stop] epoch=0/micro_step=40020/global_step=40020, RunningAvgSamplesPerSec=2.6337936301425753, CurrSamplesPerSec=2.632207293044296, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:12:44,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=40030, skipped=0, lr=[6.264098272066011e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:12:44,974] [INFO] [timer.py:259:stop] epoch=0/micro_step=40030/global_step=40030, RunningAvgSamplesPerSec=2.6337918940769707, CurrSamplesPerSec=2.6247446378583565, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:13:00,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=40040, skipped=0, lr=[6.2624680742630315e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:13:00,284] [INFO] [timer.py:259:stop] epoch=0/micro_step=40040/global_step=40040, RunningAvgSamplesPerSec=2.6337897564931736, CurrSamplesPerSec=2.603421397397979, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:13:15,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=40050, skipped=0, lr=[6.260837733106395e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:13:15,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=40050/global_step=40050, RunningAvgSamplesPerSec=2.6337906897711285, CurrSamplesPerSec=2.6606833354930925, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:13:30,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=40060, skipped=0, lr=[6.259207248781227e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:13:30,792] [INFO] [timer.py:259:stop] epoch=0/micro_step=40060/global_step=40060, RunningAvgSamplesPerSec=2.6337905144403257, CurrSamplesPerSec=2.6293909519048495, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:13:46,073] [INFO] [logging.py:96:log_dist] [Rank 0] step=40070, skipped=0, lr=[6.25757662147267e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:13:46,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=40070/global_step=40070, RunningAvgSamplesPerSec=2.633789511506879, CurrSamplesPerSec=2.628716124276798, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:14:01,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=40080, skipped=0, lr=[6.255945851365881e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:14:01,399] [INFO] [timer.py:259:stop] epoch=0/micro_step=40080/global_step=40080, RunningAvgSamplesPerSec=2.633786854822019, CurrSamplesPerSec=2.658697830583741, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:14:16,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=40090, skipped=0, lr=[6.254314938646036e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:14:16,649] [INFO] [timer.py:259:stop] epoch=0/micro_step=40090/global_step=40090, RunningAvgSamplesPerSec=2.6337869620485157, CurrSamplesPerSec=2.632320452151692, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:14:31,996] [INFO] [logging.py:96:log_dist] [Rank 0] step=40100, skipped=0, lr=[6.252683883498325e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:14:31,998] [INFO] [timer.py:259:stop] epoch=0/micro_step=40100/global_step=40100, RunningAvgSamplesPerSec=2.6337840854179895, CurrSamplesPerSec=2.613198952778292, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:14:47,279] [INFO] [logging.py:96:log_dist] [Rank 0] step=40110, skipped=0, lr=[6.251052686107954e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:14:47,295] [INFO] [timer.py:259:stop] epoch=0/micro_step=40110/global_step=40110, RunningAvgSamplesPerSec=2.6337828846581472, CurrSamplesPerSec=2.6181271714769996, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:15:02,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=40120, skipped=0, lr=[6.249421346660147e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:15:02,626] [INFO] [timer.py:259:stop] epoch=0/micro_step=40120/global_step=40120, RunningAvgSamplesPerSec=2.6337800564012066, CurrSamplesPerSec=2.61858566212802, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:15:17,924] [INFO] [logging.py:96:log_dist] [Rank 0] step=40130, skipped=0, lr=[6.247789865340143e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:15:17,925] [INFO] [timer.py:259:stop] epoch=0/micro_step=40130/global_step=40130, RunningAvgSamplesPerSec=2.633778119516531, CurrSamplesPerSec=2.630869532032315, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:15:33,204] [INFO] [logging.py:96:log_dist] [Rank 0] step=40140, skipped=0, lr=[6.246158242333196e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:15:33,206] [INFO] [timer.py:259:stop] epoch=0/micro_step=40140/global_step=40140, RunningAvgSamplesPerSec=2.633776956787645, CurrSamplesPerSec=2.6370259795851676, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:15:48,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=40150, skipped=0, lr=[6.244526477824579e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:15:48,453] [INFO] [timer.py:259:stop] epoch=0/micro_step=40150/global_step=40150, RunningAvgSamplesPerSec=2.6337778207769094, CurrSamplesPerSec=2.6394425678307343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:16:03,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=40160, skipped=0, lr=[6.242894571999582e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:16:03,739] [INFO] [timer.py:259:stop] epoch=0/micro_step=40160/global_step=40160, RunningAvgSamplesPerSec=2.633776865341246, CurrSamplesPerSec=2.631838973842765, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:16:19,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=40170, skipped=0, lr=[6.241262525043502e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:16:19,075] [INFO] [timer.py:259:stop] epoch=0/micro_step=40170/global_step=40170, RunningAvgSamplesPerSec=2.633773707674944, CurrSamplesPerSec=2.582248252230997, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:16:34,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=40180, skipped=0, lr=[6.239630337141661e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:16:34,410] [INFO] [timer.py:259:stop] epoch=0/micro_step=40180/global_step=40180, RunningAvgSamplesPerSec=2.6337708078178315, CurrSamplesPerSec=2.6470785296181285, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:16:49,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=40190, skipped=0, lr=[6.237998008479397e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:16:49,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=40190/global_step=40190, RunningAvgSamplesPerSec=2.6337705894128334, CurrSamplesPerSec=2.599243196452769, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:17:04,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=40200, skipped=0, lr=[6.236365539242059e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:17:04,933] [INFO] [timer.py:259:stop] epoch=0/micro_step=40200/global_step=40200, RunningAvgSamplesPerSec=2.6337707731926927, CurrSamplesPerSec=2.6309062496752995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:17:20,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=40210, skipped=0, lr=[6.234732929615015e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:17:20,246] [INFO] [timer.py:259:stop] epoch=0/micro_step=40210/global_step=40210, RunningAvgSamplesPerSec=2.6337681881209303, CurrSamplesPerSec=2.602586620842424, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:17:35,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=40220, skipped=0, lr=[6.233100179783649e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:17:35,529] [INFO] [timer.py:259:stop] epoch=0/micro_step=40220/global_step=40220, RunningAvgSamplesPerSec=2.6337672194573734, CurrSamplesPerSec=2.6408838397308854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:17:50,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=40230, skipped=0, lr=[6.231467289933358e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:17:50,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=40230/global_step=40230, RunningAvgSamplesPerSec=2.6337647287233747, CurrSamplesPerSec=2.6203594316115093, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:18:06,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=40240, skipped=0, lr=[6.229834260249561e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:18:06,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=40240/global_step=40240, RunningAvgSamplesPerSec=2.633764379004654, CurrSamplesPerSec=2.638000794271584, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:18:21,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=40250, skipped=0, lr=[6.228201090917685e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:18:21,401] [INFO] [timer.py:259:stop] epoch=0/micro_step=40250/global_step=40250, RunningAvgSamplesPerSec=2.6337633799817013, CurrSamplesPerSec=2.634712696894588, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:18:36,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=40260, skipped=0, lr=[6.22656778212318e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:18:36,632] [INFO] [timer.py:259:stop] epoch=0/micro_step=40260/global_step=40260, RunningAvgSamplesPerSec=2.633764187583526, CurrSamplesPerSec=2.642531033479188, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:18:51,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=40270, skipped=0, lr=[6.224934334051507e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:18:51,964] [INFO] [timer.py:259:stop] epoch=0/micro_step=40270/global_step=40270, RunningAvgSamplesPerSec=2.6337608750732207, CurrSamplesPerSec=2.6175528545421005, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:19:07,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=40280, skipped=0, lr=[6.223300746888145e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:19:07,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=40280/global_step=40280, RunningAvgSamplesPerSec=2.6337625168557057, CurrSamplesPerSec=2.632787233042995, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:19:22,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=40290, skipped=0, lr=[6.221667020818589e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:19:22,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=40290/global_step=40290, RunningAvgSamplesPerSec=2.6337610176833763, CurrSamplesPerSec=2.6427770408992495, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:19:37,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=40300, skipped=0, lr=[6.220033156028348e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:19:37,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=40300/global_step=40300, RunningAvgSamplesPerSec=2.633759607636741, CurrSamplesPerSec=2.6301275602467675, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:19:53,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=40310, skipped=0, lr=[6.218399152702948e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:19:53,088] [INFO] [timer.py:259:stop] epoch=0/micro_step=40310/global_step=40310, RunningAvgSamplesPerSec=2.633757478924156, CurrSamplesPerSec=2.6233578408763343, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:20:08,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=40320, skipped=0, lr=[6.216765011027932e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:20:08,375] [INFO] [timer.py:259:stop] epoch=0/micro_step=40320/global_step=40320, RunningAvgSamplesPerSec=2.633756401774436, CurrSamplesPerSec=2.6235933165495564, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:20:23,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=40330, skipped=0, lr=[6.215130731188855e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:20:23,659] [INFO] [timer.py:259:stop] epoch=0/micro_step=40330/global_step=40330, RunningAvgSamplesPerSec=2.633755337073045, CurrSamplesPerSec=2.6228185393659835, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:20:38,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=40340, skipped=0, lr=[6.213496313371292e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:20:38,943] [INFO] [timer.py:259:stop] epoch=0/micro_step=40340/global_step=40340, RunningAvgSamplesPerSec=2.63375398413225, CurrSamplesPerSec=2.619059440990318, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:20:54,255] [INFO] [logging.py:96:log_dist] [Rank 0] step=40350, skipped=0, lr=[6.211861757760831e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:20:54,257] [INFO] [timer.py:259:stop] epoch=0/micro_step=40350/global_step=40350, RunningAvgSamplesPerSec=2.6337523191681607, CurrSamplesPerSec=2.633781656268309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:21:09,551] [INFO] [logging.py:96:log_dist] [Rank 0] step=40360, skipped=0, lr=[6.210227064543077e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:21:09,566] [INFO] [timer.py:259:stop] epoch=0/micro_step=40360/global_step=40360, RunningAvgSamplesPerSec=2.633750605631269, CurrSamplesPerSec=2.609058308453933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:21:24,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=40370, skipped=0, lr=[6.20859223390365e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:21:24,832] [INFO] [timer.py:259:stop] epoch=0/micro_step=40370/global_step=40370, RunningAvgSamplesPerSec=2.6337502377215194, CurrSamplesPerSec=2.6347892443888226, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:21:40,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=40380, skipped=0, lr=[6.2069572660281814e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:21:40,122] [INFO] [timer.py:259:stop] epoch=0/micro_step=40380/global_step=40380, RunningAvgSamplesPerSec=2.633748680057232, CurrSamplesPerSec=2.611938179343514, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:21:55,390] [INFO] [logging.py:96:log_dist] [Rank 0] step=40390, skipped=0, lr=[6.205322161102329e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:21:55,409] [INFO] [timer.py:259:stop] epoch=0/micro_step=40390/global_step=40390, RunningAvgSamplesPerSec=2.6337474509061476, CurrSamplesPerSec=2.6453972989745766, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:22:10,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=40400, skipped=0, lr=[6.203686919311756e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:22:10,717] [INFO] [timer.py:259:stop] epoch=0/micro_step=40400/global_step=40400, RunningAvgSamplesPerSec=2.633745236308647, CurrSamplesPerSec=2.6255094599524593, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:22:25,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=40410, skipped=0, lr=[6.202051540842143e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:22:25,991] [INFO] [timer.py:259:stop] epoch=0/micro_step=40410/global_step=40410, RunningAvgSamplesPerSec=2.633744616483258, CurrSamplesPerSec=2.623934297879276, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:22:41,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=40420, skipped=0, lr=[6.200416025879191e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:22:41,287] [INFO] [timer.py:259:stop] epoch=0/micro_step=40420/global_step=40420, RunningAvgSamplesPerSec=2.633742992525053, CurrSamplesPerSec=2.619827498506316, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:22:56,540] [INFO] [logging.py:96:log_dist] [Rank 0] step=40430, skipped=0, lr=[6.198780374608612e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:22:56,542] [INFO] [timer.py:259:stop] epoch=0/micro_step=40430/global_step=40430, RunningAvgSamplesPerSec=2.633743192768775, CurrSamplesPerSec=2.638201568683231, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:23:11,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=40440, skipped=0, lr=[6.1971445872161336e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:23:11,860] [INFO] [timer.py:259:stop] epoch=0/micro_step=40440/global_step=40440, RunningAvgSamplesPerSec=2.633741320707576, CurrSamplesPerSec=2.6435657388613203, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:23:27,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=40450, skipped=0, lr=[6.195508663887502e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:23:27,127] [INFO] [timer.py:259:stop] epoch=0/micro_step=40450/global_step=40450, RunningAvgSamplesPerSec=2.633741226375341, CurrSamplesPerSec=2.627803721764195, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:23:42,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=40460, skipped=0, lr=[6.193872604808475e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:23:42,406] [INFO] [timer.py:259:stop] epoch=0/micro_step=40460/global_step=40460, RunningAvgSamplesPerSec=2.6337408229145676, CurrSamplesPerSec=2.639166873870825, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:23:57,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=40470, skipped=0, lr=[6.192236410164828e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:23:57,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=40470/global_step=40470, RunningAvgSamplesPerSec=2.6337420372576155, CurrSamplesPerSec=2.647092729835808, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:24:12,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=40480, skipped=0, lr=[6.190600080142352e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:24:12,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=40480/global_step=40480, RunningAvgSamplesPerSec=2.633742031631335, CurrSamplesPerSec=2.622836580849984, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:24:28,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=40490, skipped=0, lr=[6.188963614926854e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:24:28,134] [INFO] [timer.py:259:stop] epoch=0/micro_step=40490/global_step=40490, RunningAvgSamplesPerSec=2.6337427180175825, CurrSamplesPerSec=2.649609826025384, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:24:43,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=40500, skipped=0, lr=[6.1873270147041546e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:24:43,495] [INFO] [timer.py:259:stop] epoch=0/micro_step=40500/global_step=40500, RunningAvgSamplesPerSec=2.633738512173613, CurrSamplesPerSec=2.6235334180316956, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:24:58,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=40510, skipped=0, lr=[6.185690279660087e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:24:58,838] [INFO] [timer.py:259:stop] epoch=0/micro_step=40510/global_step=40510, RunningAvgSamplesPerSec=2.6337354485641615, CurrSamplesPerSec=2.5964261946284064, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:25:14,119] [INFO] [logging.py:96:log_dist] [Rank 0] step=40520, skipped=0, lr=[6.184053409980506e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:25:14,120] [INFO] [timer.py:259:stop] epoch=0/micro_step=40520/global_step=40520, RunningAvgSamplesPerSec=2.63373446239087, CurrSamplesPerSec=2.637075718735075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:25:29,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=40530, skipped=0, lr=[6.1824164058512795e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:25:29,371] [INFO] [timer.py:259:stop] epoch=0/micro_step=40530/global_step=40530, RunningAvgSamplesPerSec=2.6337348507805167, CurrSamplesPerSec=2.610255790395723, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:25:44,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=40540, skipped=0, lr=[6.180779267458287e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:25:44,644] [INFO] [timer.py:259:stop] epoch=0/micro_step=40540/global_step=40540, RunningAvgSamplesPerSec=2.633734460032531, CurrSamplesPerSec=2.6436944570615286, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:26:00,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=40550, skipped=0, lr=[6.179141994987429e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:26:00,015] [INFO] [timer.py:259:stop] epoch=0/micro_step=40550/global_step=40550, RunningAvgSamplesPerSec=2.633729955694214, CurrSamplesPerSec=2.573277416356813, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:26:15,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=40560, skipped=0, lr=[6.1775045886246155e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:26:15,297] [INFO] [timer.py:259:stop] epoch=0/micro_step=40560/global_step=40560, RunningAvgSamplesPerSec=2.6337287774926734, CurrSamplesPerSec=2.6358224499132854, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:26:30,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=40570, skipped=0, lr=[6.175867048555777e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:26:30,568] [INFO] [timer.py:259:stop] epoch=0/micro_step=40570/global_step=40570, RunningAvgSamplesPerSec=2.6337281692622163, CurrSamplesPerSec=2.632595130597329, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:26:45,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=40580, skipped=0, lr=[6.174229374966856e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:26:45,859] [INFO] [timer.py:259:stop] epoch=0/micro_step=40580/global_step=40580, RunningAvgSamplesPerSec=2.633726801651089, CurrSamplesPerSec=2.6408564038994573, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:27:01,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=40590, skipped=0, lr=[6.17259156804381e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:27:01,171] [INFO] [timer.py:259:stop] epoch=0/micro_step=40590/global_step=40590, RunningAvgSamplesPerSec=2.6337248179759714, CurrSamplesPerSec=2.6301143660934, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:27:16,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=40600, skipped=0, lr=[6.1709536279726135e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:27:16,434] [INFO] [timer.py:259:stop] epoch=0/micro_step=40600/global_step=40600, RunningAvgSamplesPerSec=2.633724332004608, CurrSamplesPerSec=2.639302222443113, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:27:31,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=40610, skipped=0, lr=[6.1693155549392535e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:27:31,707] [INFO] [timer.py:259:stop] epoch=0/micro_step=40610/global_step=40610, RunningAvgSamplesPerSec=2.6337236310983503, CurrSamplesPerSec=2.6327372423963364, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:27:46,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=40620, skipped=0, lr=[6.167677349129735e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:27:46,953] [INFO] [timer.py:259:stop] epoch=0/micro_step=40620/global_step=40620, RunningAvgSamplesPerSec=2.633724363170585, CurrSamplesPerSec=2.6405363613151085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:28:02,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=40630, skipped=0, lr=[6.166039010730079e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:28:02,228] [INFO] [timer.py:259:stop] epoch=0/micro_step=40630/global_step=40630, RunningAvgSamplesPerSec=2.633723919373159, CurrSamplesPerSec=2.6464718214833494, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:28:17,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=40640, skipped=0, lr=[6.164400539926314e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:28:17,500] [INFO] [timer.py:259:stop] epoch=0/micro_step=40640/global_step=40640, RunningAvgSamplesPerSec=2.6337237353309084, CurrSamplesPerSec=2.621731588043399, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:28:32,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=40650, skipped=0, lr=[6.162761936904494e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:28:32,800] [INFO] [timer.py:259:stop] epoch=0/micro_step=40650/global_step=40650, RunningAvgSamplesPerSec=2.633722225658253, CurrSamplesPerSec=2.6477135117567694, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:28:48,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=40660, skipped=0, lr=[6.161123201850679e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:28:48,164] [INFO] [timer.py:259:stop] epoch=0/micro_step=40660/global_step=40660, RunningAvgSamplesPerSec=2.633718312928666, CurrSamplesPerSec=2.64262426948019, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:29:03,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=40670, skipped=0, lr=[6.1594843349509505e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:29:03,461] [INFO] [timer.py:259:stop] epoch=0/micro_step=40670/global_step=40670, RunningAvgSamplesPerSec=2.633716824481245, CurrSamplesPerSec=2.6439281813679885, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:29:18,732] [INFO] [logging.py:96:log_dist] [Rank 0] step=40680, skipped=0, lr=[6.157845336391401e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:29:18,737] [INFO] [timer.py:259:stop] epoch=0/micro_step=40680/global_step=40680, RunningAvgSamplesPerSec=2.633715873835345, CurrSamplesPerSec=2.6238407346123656, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:29:34,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=40690, skipped=0, lr=[6.156206206358142e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:29:34,013] [INFO] [timer.py:259:stop] epoch=0/micro_step=40690/global_step=40690, RunningAvgSamplesPerSec=2.633715267720678, CurrSamplesPerSec=2.6329446540090133, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:29:49,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=40700, skipped=0, lr=[6.154566945037293e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:29:49,327] [INFO] [timer.py:259:stop] epoch=0/micro_step=40700/global_step=40700, RunningAvgSamplesPerSec=2.633713356502935, CurrSamplesPerSec=2.605791734368323, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:30:04,591] [INFO] [logging.py:96:log_dist] [Rank 0] step=40710, skipped=0, lr=[6.1529275526149976e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:30:04,598] [INFO] [timer.py:259:stop] epoch=0/micro_step=40710/global_step=40710, RunningAvgSamplesPerSec=2.6337131266432845, CurrSamplesPerSec=2.644390335526752, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:30:19,861] [INFO] [logging.py:96:log_dist] [Rank 0] step=40720, skipped=0, lr=[6.151288029277404e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:30:19,867] [INFO] [timer.py:259:stop] epoch=0/micro_step=40720/global_step=40720, RunningAvgSamplesPerSec=2.633713155860264, CurrSamplesPerSec=2.6282656081234532, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:30:35,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=40730, skipped=0, lr=[6.1496483752106836e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:30:35,114] [INFO] [timer.py:259:stop] epoch=0/micro_step=40730/global_step=40730, RunningAvgSamplesPerSec=2.633713856899983, CurrSamplesPerSec=2.6482911086668497, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:30:50,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=40740, skipped=0, lr=[6.148008590601019e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:30:50,430] [INFO] [timer.py:259:stop] epoch=0/micro_step=40740/global_step=40740, RunningAvgSamplesPerSec=2.633711579432852, CurrSamplesPerSec=2.6156421862057417, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:31:05,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=40750, skipped=0, lr=[6.146368675634608e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:31:05,711] [INFO] [timer.py:259:stop] epoch=0/micro_step=40750/global_step=40750, RunningAvgSamplesPerSec=2.6337104372054596, CurrSamplesPerSec=2.6297272590815455, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:31:20,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=40760, skipped=0, lr=[6.144728630497666e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:31:21,012] [INFO] [timer.py:259:stop] epoch=0/micro_step=40760/global_step=40760, RunningAvgSamplesPerSec=2.633708630823228, CurrSamplesPerSec=2.6279280281493533, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:31:36,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=40770, skipped=0, lr=[6.143088455376417e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:31:36,326] [INFO] [timer.py:259:stop] epoch=0/micro_step=40770/global_step=40770, RunningAvgSamplesPerSec=2.633706370236871, CurrSamplesPerSec=2.63658669856984, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:31:51,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=40780, skipped=0, lr=[6.141448150457105e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:31:51,579] [INFO] [timer.py:259:stop] epoch=0/micro_step=40780/global_step=40780, RunningAvgSamplesPerSec=2.63370680942016, CurrSamplesPerSec=2.640628625374309, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:32:06,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=40790, skipped=0, lr=[6.1398077159259885e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:32:06,844] [INFO] [timer.py:259:stop] epoch=0/micro_step=40790/global_step=40790, RunningAvgSamplesPerSec=2.6337068562017985, CurrSamplesPerSec=2.620771623669338, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:32:22,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=40800, skipped=0, lr=[6.138167151969337e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:32:22,111] [INFO] [timer.py:259:stop] epoch=0/micro_step=40800/global_step=40800, RunningAvgSamplesPerSec=2.6337066950638026, CurrSamplesPerSec=2.643263362890388, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:32:37,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=40810, skipped=0, lr=[6.136526458773437e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:32:37,435] [INFO] [timer.py:259:stop] epoch=0/micro_step=40810/global_step=40810, RunningAvgSamplesPerSec=2.6337041805632264, CurrSamplesPerSec=2.639106262139638, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:32:52,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=40820, skipped=0, lr=[6.134885636524592e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:32:52,725] [INFO] [timer.py:259:stop] epoch=0/micro_step=40820/global_step=40820, RunningAvgSamplesPerSec=2.633702628439658, CurrSamplesPerSec=2.63871774894907, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:33:08,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=40830, skipped=0, lr=[6.133244685409116e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:33:08,033] [INFO] [timer.py:259:stop] epoch=0/micro_step=40830/global_step=40830, RunningAvgSamplesPerSec=2.63370162161524, CurrSamplesPerSec=2.617740317217711, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:33:23,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=40840, skipped=0, lr=[6.131603605613342e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:33:23,374] [INFO] [timer.py:259:stop] epoch=0/micro_step=40840/global_step=40840, RunningAvgSamplesPerSec=2.633698541341768, CurrSamplesPerSec=2.6422359682493184, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:33:38,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=40850, skipped=0, lr=[6.129962397323614e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:33:38,642] [INFO] [timer.py:259:stop] epoch=0/micro_step=40850/global_step=40850, RunningAvgSamplesPerSec=2.633698394350494, CurrSamplesPerSec=2.606443503815097, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:33:53,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=40860, skipped=0, lr=[6.128321060726289e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:33:53,953] [INFO] [timer.py:259:stop] epoch=0/micro_step=40860/global_step=40860, RunningAvgSamplesPerSec=2.6336963497669856, CurrSamplesPerSec=2.6412867132543485, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:34:09,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=40870, skipped=0, lr=[6.126679596007742e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:34:09,311] [INFO] [timer.py:259:stop] epoch=0/micro_step=40870/global_step=40870, RunningAvgSamplesPerSec=2.6336924703895943, CurrSamplesPerSec=2.5658017146398486, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:34:24,566] [INFO] [logging.py:96:log_dist] [Rank 0] step=40880, skipped=0, lr=[6.125038003354365e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:34:24,567] [INFO] [timer.py:259:stop] epoch=0/micro_step=40880/global_step=40880, RunningAvgSamplesPerSec=2.6336926444005093, CurrSamplesPerSec=2.6416610093063464, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:34:39,894] [INFO] [logging.py:96:log_dist] [Rank 0] step=40890, skipped=0, lr=[6.123396282952561e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:34:39,895] [INFO] [timer.py:259:stop] epoch=0/micro_step=40890/global_step=40890, RunningAvgSamplesPerSec=2.633690071567472, CurrSamplesPerSec=2.542957114185184, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:34:55,151] [INFO] [logging.py:96:log_dist] [Rank 0] step=40900, skipped=0, lr=[6.121754434988745e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:34:55,152] [INFO] [timer.py:259:stop] epoch=0/micro_step=40900/global_step=40900, RunningAvgSamplesPerSec=2.6336901722637283, CurrSamplesPerSec=2.6396921537464055, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:35:10,524] [INFO] [logging.py:96:log_dist] [Rank 0] step=40910, skipped=0, lr=[6.120112459649352e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:35:10,525] [INFO] [timer.py:259:stop] epoch=0/micro_step=40910/global_step=40910, RunningAvgSamplesPerSec=2.633685668513643, CurrSamplesPerSec=2.5894109311445166, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:35:25,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=40920, skipped=0, lr=[6.118470357120826e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:35:25,796] [INFO] [timer.py:259:stop] epoch=0/micro_step=40920/global_step=40920, RunningAvgSamplesPerSec=2.633685356871461, CurrSamplesPerSec=2.6217909946027964, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:35:41,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=40930, skipped=0, lr=[6.116828127589631e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:35:41,145] [INFO] [timer.py:259:stop] epoch=0/micro_step=40930/global_step=40930, RunningAvgSamplesPerSec=2.633681637078548, CurrSamplesPerSec=2.6503289188655423, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:35:56,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=40940, skipped=0, lr=[6.115185771242243e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:35:56,461] [INFO] [timer.py:259:stop] epoch=0/micro_step=40940/global_step=40940, RunningAvgSamplesPerSec=2.6336789160577747, CurrSamplesPerSec=2.628096807295836, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:36:11,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=40950, skipped=0, lr=[6.113543288265148e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:36:11,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=40950/global_step=40950, RunningAvgSamplesPerSec=2.633678031205324, CurrSamplesPerSec=2.6186919306517855, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:36:27,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=40960, skipped=0, lr=[6.111900678844854e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:36:27,034] [INFO] [timer.py:259:stop] epoch=0/micro_step=40960/global_step=40960, RunningAvgSamplesPerSec=2.6336767546207804, CurrSamplesPerSec=2.6280259997459314, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:36:42,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=40970, skipped=0, lr=[6.110257943167881e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:36:42,474] [INFO] [timer.py:259:stop] epoch=0/micro_step=40970/global_step=40970, RunningAvgSamplesPerSec=2.6336696057079467, CurrSamplesPerSec=2.6159182893041257, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:36:57,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=40980, skipped=0, lr=[6.108615081420759e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:36:57,862] [INFO] [timer.py:259:stop] epoch=0/micro_step=40980/global_step=40980, RunningAvgSamplesPerSec=2.633664702774116, CurrSamplesPerSec=2.6270758188345718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:37:13,222] [INFO] [logging.py:96:log_dist] [Rank 0] step=40990, skipped=0, lr=[6.1069720937900365e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:37:13,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=40990/global_step=40990, RunningAvgSamplesPerSec=2.633661108429759, CurrSamplesPerSec=2.6489082705610283, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:37:28,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=41000, skipped=0, lr=[6.105328980462276e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:37:28,550] [INFO] [timer.py:259:stop] epoch=0/micro_step=41000/global_step=41000, RunningAvgSamplesPerSec=2.6336582379693754, CurrSamplesPerSec=2.614995590849534, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:37:43,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=41010, skipped=0, lr=[6.103685741624053e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:37:43,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=41010/global_step=41010, RunningAvgSamplesPerSec=2.6336551707109823, CurrSamplesPerSec=2.6563421803357348, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:37:59,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=41020, skipped=0, lr=[6.1020423774619574e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:37:59,193] [INFO] [timer.py:259:stop] epoch=0/micro_step=41020/global_step=41020, RunningAvgSamplesPerSec=2.633653217760703, CurrSamplesPerSec=2.595333702801721, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:38:14,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=41030, skipped=0, lr=[6.100398888162594e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:38:14,483] [INFO] [timer.py:259:stop] epoch=0/micro_step=41030/global_step=41030, RunningAvgSamplesPerSec=2.633652124225884, CurrSamplesPerSec=2.626859048376014, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:38:29,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=41040, skipped=0, lr=[6.098755273912582e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:38:29,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=41040/global_step=41040, RunningAvgSamplesPerSec=2.6336488721553226, CurrSamplesPerSec=2.5882548544531048, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:38:45,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=41050, skipped=0, lr=[6.0971115348985564e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:38:45,113] [INFO] [timer.py:259:stop] epoch=0/micro_step=41050/global_step=41050, RunningAvgSamplesPerSec=2.6336475094677274, CurrSamplesPerSec=2.625573146764733, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:39:00,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=41060, skipped=0, lr=[6.095467671307158e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:39:00,461] [INFO] [timer.py:259:stop] epoch=0/micro_step=41060/global_step=41060, RunningAvgSamplesPerSec=2.6336438474693336, CurrSamplesPerSec=2.619834044062642, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:39:15,807] [INFO] [logging.py:96:log_dist] [Rank 0] step=41070, skipped=0, lr=[6.093823683325055e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:39:15,822] [INFO] [timer.py:259:stop] epoch=0/micro_step=41070/global_step=41070, RunningAvgSamplesPerSec=2.633639719351815, CurrSamplesPerSec=2.6200537482689747, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:39:31,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=41080, skipped=0, lr=[6.092179571138919e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:39:31,183] [INFO] [timer.py:259:stop] epoch=0/micro_step=41080/global_step=41080, RunningAvgSamplesPerSec=2.6336358350109172, CurrSamplesPerSec=2.6173837935886013, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:39:46,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=41090, skipped=0, lr=[6.090535334935439e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:39:46,456] [INFO] [timer.py:259:stop] epoch=0/micro_step=41090/global_step=41090, RunningAvgSamplesPerSec=2.6336355770655313, CurrSamplesPerSec=2.636203483284846, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:40:01,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=41100, skipped=0, lr=[6.088890974901322e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:40:01,884] [INFO] [timer.py:259:stop] epoch=0/micro_step=41100/global_step=41100, RunningAvgSamplesPerSec=2.633629077543446, CurrSamplesPerSec=2.6271963537490413, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:40:17,207] [INFO] [logging.py:96:log_dist] [Rank 0] step=41110, skipped=0, lr=[6.0872464912232805e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:40:17,212] [INFO] [timer.py:259:stop] epoch=0/micro_step=41110/global_step=41110, RunningAvgSamplesPerSec=2.633626316566518, CurrSamplesPerSec=2.63942886481292, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:40:32,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=41120, skipped=0, lr=[6.085601884088052e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:40:32,524] [INFO] [timer.py:259:stop] epoch=0/micro_step=41120/global_step=41120, RunningAvgSamplesPerSec=2.6336244816907275, CurrSamplesPerSec=2.6403701360402705, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:40:47,831] [INFO] [logging.py:96:log_dist] [Rank 0] step=41130, skipped=0, lr=[6.083957153682377e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:40:47,833] [INFO] [timer.py:259:stop] epoch=0/micro_step=41130/global_step=41130, RunningAvgSamplesPerSec=2.633622778766884, CurrSamplesPerSec=2.6295215904308145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:41:03,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=41140, skipped=0, lr=[6.082312300193019e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:41:03,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=41140/global_step=41140, RunningAvgSamplesPerSec=2.6336188617197895, CurrSamplesPerSec=2.6425813967519307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:41:18,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=41150, skipped=0, lr=[6.080667323806749e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:41:18,521] [INFO] [timer.py:259:stop] epoch=0/micro_step=41150/global_step=41150, RunningAvgSamplesPerSec=2.6336155364068916, CurrSamplesPerSec=2.615667061643705, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:41:33,906] [INFO] [logging.py:96:log_dist] [Rank 0] step=41160, skipped=0, lr=[6.079022224710355e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:41:33,908] [INFO] [timer.py:259:stop] epoch=0/micro_step=41160/global_step=41160, RunningAvgSamplesPerSec=2.633611215919856, CurrSamplesPerSec=2.6293670509925575, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:41:49,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=41170, skipped=0, lr=[6.0773770030906395e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:41:49,313] [INFO] [timer.py:259:stop] epoch=0/micro_step=41170/global_step=41170, RunningAvgSamplesPerSec=2.6336050347275086, CurrSamplesPerSec=2.564464728434094, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:42:04,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=41180, skipped=0, lr=[6.07573165913442e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:42:04,658] [INFO] [timer.py:259:stop] epoch=0/micro_step=41180/global_step=41180, RunningAvgSamplesPerSec=2.6336024885380014, CurrSamplesPerSec=2.648524392156086, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:42:20,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=41190, skipped=0, lr=[6.074086193028522e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:42:20,010] [INFO] [timer.py:259:stop] epoch=0/micro_step=41190/global_step=41190, RunningAvgSamplesPerSec=2.633598660545683, CurrSamplesPerSec=2.5921646616543224, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:42:35,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=41200, skipped=0, lr=[6.072440604959789e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:42:35,337] [INFO] [timer.py:259:stop] epoch=0/micro_step=41200/global_step=41200, RunningAvgSamplesPerSec=2.6335955884059894, CurrSamplesPerSec=2.6404162614560143, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:42:50,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=41210, skipped=0, lr=[6.07079489511508e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:42:50,675] [INFO] [timer.py:259:stop] epoch=0/micro_step=41210/global_step=41210, RunningAvgSamplesPerSec=2.633592536223727, CurrSamplesPerSec=2.595970610157829, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:43:05,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=41220, skipped=0, lr=[6.069149063681264e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:43:05,986] [INFO] [timer.py:259:stop] epoch=0/micro_step=41220/global_step=41220, RunningAvgSamplesPerSec=2.633590436920205, CurrSamplesPerSec=2.6248838499472775, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:43:21,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=41230, skipped=0, lr=[6.067503110845228e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:43:21,425] [INFO] [timer.py:259:stop] epoch=0/micro_step=41230/global_step=41230, RunningAvgSamplesPerSec=2.633582585262157, CurrSamplesPerSec=2.5508367470798436, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:43:36,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=41240, skipped=0, lr=[6.06585703679387e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:43:36,713] [INFO] [timer.py:259:stop] epoch=0/micro_step=41240/global_step=41240, RunningAvgSamplesPerSec=2.633581617897854, CurrSamplesPerSec=2.629516232759662, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:43:52,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=41250, skipped=0, lr=[6.064210841714102e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:43:52,099] [INFO] [timer.py:259:stop] epoch=0/micro_step=41250/global_step=41250, RunningAvgSamplesPerSec=2.633576183749702, CurrSamplesPerSec=2.6330231649377955, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:44:07,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=41260, skipped=0, lr=[6.062564525792847e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:44:07,421] [INFO] [timer.py:259:stop] epoch=0/micro_step=41260/global_step=41260, RunningAvgSamplesPerSec=2.633573708803324, CurrSamplesPerSec=2.6350768544516123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:44:22,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=41270, skipped=0, lr=[6.0609180892170485e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:44:22,760] [INFO] [timer.py:259:stop] epoch=0/micro_step=41270/global_step=41270, RunningAvgSamplesPerSec=2.633569958689334, CurrSamplesPerSec=2.6115194113504034, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:44:38,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=41280, skipped=0, lr=[6.059271532173658e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:44:38,087] [INFO] [timer.py:259:stop] epoch=0/micro_step=41280/global_step=41280, RunningAvgSamplesPerSec=2.6335670744654425, CurrSamplesPerSec=2.6221724898301897, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:44:53,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=41290, skipped=0, lr=[6.057624854849644e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:44:53,444] [INFO] [timer.py:259:stop] epoch=0/micro_step=41290/global_step=41290, RunningAvgSamplesPerSec=2.633562915587989, CurrSamplesPerSec=2.6379327702040714, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:45:08,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=41300, skipped=0, lr=[6.0559780574319856e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:45:08,770] [INFO] [timer.py:259:stop] epoch=0/micro_step=41300/global_step=41300, RunningAvgSamplesPerSec=2.6335603119311317, CurrSamplesPerSec=2.574644550268037, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:45:24,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=41310, skipped=0, lr=[6.0543311401076796e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:45:24,083] [INFO] [timer.py:259:stop] epoch=0/micro_step=41310/global_step=41310, RunningAvgSamplesPerSec=2.6335581401888013, CurrSamplesPerSec=2.629261150454123, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:45:39,523] [INFO] [logging.py:96:log_dist] [Rank 0] step=41320, skipped=0, lr=[6.052684103063731e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:45:39,544] [INFO] [timer.py:259:stop] epoch=0/micro_step=41320/global_step=41320, RunningAvgSamplesPerSec=2.6335498422990185, CurrSamplesPerSec=2.574705398175591, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:45:54,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=41330, skipped=0, lr=[6.0510369464871635e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:45:54,900] [INFO] [timer.py:259:stop] epoch=0/micro_step=41330/global_step=41330, RunningAvgSamplesPerSec=2.633546203719276, CurrSamplesPerSec=2.626817096935294, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:46:10,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=41340, skipped=0, lr=[6.049389670565011e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:46:10,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=41340/global_step=41340, RunningAvgSamplesPerSec=2.63354269793341, CurrSamplesPerSec=2.558411847457106, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:46:25,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=41350, skipped=0, lr=[6.047742275484323e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:46:25,565] [INFO] [timer.py:259:stop] epoch=0/micro_step=41350/global_step=41350, RunningAvgSamplesPerSec=2.6335406311295277, CurrSamplesPerSec=2.5777562176613684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:46:40,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=41360, skipped=0, lr=[6.04609476143216e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:46:40,878] [INFO] [timer.py:259:stop] epoch=0/micro_step=41360/global_step=41360, RunningAvgSamplesPerSec=2.633538906421239, CurrSamplesPerSec=2.6162527906236557, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:46:56,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=41370, skipped=0, lr=[6.0444471285956005e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:46:56,246] [INFO] [timer.py:259:stop] epoch=0/micro_step=41370/global_step=41370, RunningAvgSamplesPerSec=2.6335341030597283, CurrSamplesPerSec=2.617747669242329, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:47:11,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=41380, skipped=0, lr=[6.042799377161733e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:47:11,544] [INFO] [timer.py:259:stop] epoch=0/micro_step=41380/global_step=41380, RunningAvgSamplesPerSec=2.6335324715850783, CurrSamplesPerSec=2.6336109059443724, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:47:26,794] [INFO] [logging.py:96:log_dist] [Rank 0] step=41390, skipped=0, lr=[6.0411515073176605e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:47:26,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=41390/global_step=41390, RunningAvgSamplesPerSec=2.633532656277532, CurrSamplesPerSec=2.624577931306933, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:47:42,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=41400, skipped=0, lr=[6.039503519250496e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:47:42,179] [INFO] [timer.py:259:stop] epoch=0/micro_step=41400/global_step=41400, RunningAvgSamplesPerSec=2.633528083095988, CurrSamplesPerSec=2.632199446605589, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:47:57,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=41410, skipped=0, lr=[6.037855413147373e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:47:57,465] [INFO] [timer.py:259:stop] epoch=0/micro_step=41410/global_step=41410, RunningAvgSamplesPerSec=2.6335269262719563, CurrSamplesPerSec=2.6333389087867065, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:48:12,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=41420, skipped=0, lr=[6.036207189195432e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:48:12,827] [INFO] [timer.py:259:stop] epoch=0/micro_step=41420/global_step=41420, RunningAvgSamplesPerSec=2.6335228668703308, CurrSamplesPerSec=2.628618513065667, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:48:28,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=41430, skipped=0, lr=[6.0345588475818304e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:48:28,168] [INFO] [timer.py:259:stop] epoch=0/micro_step=41430/global_step=41430, RunningAvgSamplesPerSec=2.633520072202481, CurrSamplesPerSec=2.6296967570895493, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:48:43,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=41440, skipped=0, lr=[6.032910388493739e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:48:43,554] [INFO] [timer.py:259:stop] epoch=0/micro_step=41440/global_step=41440, RunningAvgSamplesPerSec=2.633514703963127, CurrSamplesPerSec=2.6354411126868307, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:48:58,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=41450, skipped=0, lr=[6.031261812118338e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:48:58,876] [INFO] [timer.py:259:stop] epoch=0/micro_step=41450/global_step=41450, RunningAvgSamplesPerSec=2.633511973628391, CurrSamplesPerSec=2.5934096206872406, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:49:14,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=41460, skipped=0, lr=[6.029613118642828e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:49:14,218] [INFO] [timer.py:259:stop] epoch=0/micro_step=41460/global_step=41460, RunningAvgSamplesPerSec=2.6335086764654334, CurrSamplesPerSec=2.625166015312945, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:49:29,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=41470, skipped=0, lr=[6.027964308254413e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:49:29,616] [INFO] [timer.py:259:stop] epoch=0/micro_step=41470/global_step=41470, RunningAvgSamplesPerSec=2.633503124862571, CurrSamplesPerSec=2.57743029938718, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:49:44,925] [INFO] [logging.py:96:log_dist] [Rank 0] step=41480, skipped=0, lr=[6.026315381140321e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:49:44,946] [INFO] [timer.py:259:stop] epoch=0/micro_step=41480/global_step=41480, RunningAvgSamplesPerSec=2.6335001791950394, CurrSamplesPerSec=2.653938642133081, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:50:00,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=41490, skipped=0, lr=[6.024666337487785e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:50:00,309] [INFO] [timer.py:259:stop] epoch=0/micro_step=41490/global_step=41490, RunningAvgSamplesPerSec=2.633495476799681, CurrSamplesPerSec=2.5987737413756684, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:50:15,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=41500, skipped=0, lr=[6.023017177484056e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:50:15,573] [INFO] [timer.py:259:stop] epoch=0/micro_step=41500/global_step=41500, RunningAvgSamplesPerSec=2.6334953240939556, CurrSamplesPerSec=2.6343304397689535, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:50:30,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=41510, skipped=0, lr=[6.0213679013163964e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:50:30,888] [INFO] [timer.py:259:stop] epoch=0/micro_step=41510/global_step=41510, RunningAvgSamplesPerSec=2.633492999547106, CurrSamplesPerSec=2.631673841754505, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:50:46,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=41520, skipped=0, lr=[6.0197185091720835e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:50:46,161] [INFO] [timer.py:259:stop] epoch=0/micro_step=41520/global_step=41520, RunningAvgSamplesPerSec=2.6334921519290484, CurrSamplesPerSec=2.629623390331075, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:51:01,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=41530, skipped=0, lr=[6.018069001238404e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:51:01,523] [INFO] [timer.py:259:stop] epoch=0/micro_step=41530/global_step=41530, RunningAvgSamplesPerSec=2.633488027051153, CurrSamplesPerSec=2.5997116126551716, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:51:16,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=41540, skipped=0, lr=[6.01641937770266e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:51:16,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=41540/global_step=41540, RunningAvgSamplesPerSec=2.63348599192428, CurrSamplesPerSec=2.6180152293451706, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:51:32,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=41550, skipped=0, lr=[6.014769638752167e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:51:32,187] [INFO] [timer.py:259:stop] epoch=0/micro_step=41550/global_step=41550, RunningAvgSamplesPerSec=2.6334828930094654, CurrSamplesPerSec=2.6216717745036027, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:51:47,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=41560, skipped=0, lr=[6.013119784574255e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:51:47,540] [INFO] [timer.py:259:stop] epoch=0/micro_step=41560/global_step=41560, RunningAvgSamplesPerSec=2.633479222117903, CurrSamplesPerSec=2.6309058371118197, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:52:02,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=41570, skipped=0, lr=[6.011469815356266e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:52:02,866] [INFO] [timer.py:259:stop] epoch=0/micro_step=41570/global_step=41570, RunningAvgSamplesPerSec=2.6334761295307842, CurrSamplesPerSec=2.6273345917032263, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:52:18,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=41580, skipped=0, lr=[6.009819731285551e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:52:18,210] [INFO] [timer.py:259:stop] epoch=0/micro_step=41580/global_step=41580, RunningAvgSamplesPerSec=2.6334728462793464, CurrSamplesPerSec=2.649124094208349, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:52:33,601] [INFO] [logging.py:96:log_dist] [Rank 0] step=41590, skipped=0, lr=[6.008169532549482e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:52:33,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=41590/global_step=41590, RunningAvgSamplesPerSec=2.6334674554247854, CurrSamplesPerSec=2.595536467256295, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:52:48,926] [INFO] [logging.py:96:log_dist] [Rank 0] step=41600, skipped=0, lr=[6.006519219335434e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:52:48,927] [INFO] [timer.py:259:stop] epoch=0/micro_step=41600/global_step=41600, RunningAvgSamplesPerSec=2.633464735987012, CurrSamplesPerSec=2.620246480060352, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:53:04,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=41610, skipped=0, lr=[6.004868791830805e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:53:04,250] [INFO] [timer.py:259:stop] epoch=0/micro_step=41610/global_step=41610, RunningAvgSamplesPerSec=2.633462610003656, CurrSamplesPerSec=2.6200856636767864, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:53:19,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=41620, skipped=0, lr=[6.003218250223002e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:53:19,576] [INFO] [timer.py:259:stop] epoch=0/micro_step=41620/global_step=41620, RunningAvgSamplesPerSec=2.6334602392393403, CurrSamplesPerSec=2.6057116013792085, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:53:34,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=41630, skipped=0, lr=[6.001567594699441e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:53:34,921] [INFO] [timer.py:259:stop] epoch=0/micro_step=41630/global_step=41630, RunningAvgSamplesPerSec=2.6334569095508336, CurrSamplesPerSec=2.631463328393081, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:53:50,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=41640, skipped=0, lr=[5.999916825447558e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:53:50,252] [INFO] [timer.py:259:stop] epoch=0/micro_step=41640/global_step=41640, RunningAvgSamplesPerSec=2.6334533612475246, CurrSamplesPerSec=2.5994232126304087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:54:05,544] [INFO] [logging.py:96:log_dist] [Rank 0] step=41650, skipped=0, lr=[5.998265942654795e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:54:05,545] [INFO] [timer.py:259:stop] epoch=0/micro_step=41650/global_step=41650, RunningAvgSamplesPerSec=2.633452232728409, CurrSamplesPerSec=2.6220614308927566, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:54:20,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=41660, skipped=0, lr=[5.996614946508613e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:54:20,941] [INFO] [timer.py:259:stop] epoch=0/micro_step=41660/global_step=41660, RunningAvgSamplesPerSec=2.6334470519385893, CurrSamplesPerSec=2.601774167052145, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:54:36,251] [INFO] [logging.py:96:log_dist] [Rank 0] step=41670, skipped=0, lr=[5.994963837196482e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:54:36,272] [INFO] [timer.py:259:stop] epoch=0/micro_step=41670/global_step=41670, RunningAvgSamplesPerSec=2.633444184473328, CurrSamplesPerSec=2.6518160580900267, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:54:51,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=41680, skipped=0, lr=[5.9933126149058874e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:54:51,643] [INFO] [timer.py:259:stop] epoch=0/micro_step=41680/global_step=41680, RunningAvgSamplesPerSec=2.633439804173017, CurrSamplesPerSec=2.6342055269659657, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:55:06,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=41690, skipped=0, lr=[5.9916612798243236e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:55:06,965] [INFO] [timer.py:259:stop] epoch=0/micro_step=41690/global_step=41690, RunningAvgSamplesPerSec=2.6334372276353735, CurrSamplesPerSec=2.6312829741273087, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:55:22,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=41700, skipped=0, lr=[5.990009832139301e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:55:22,309] [INFO] [timer.py:259:stop] epoch=0/micro_step=41700/global_step=41700, RunningAvgSamplesPerSec=2.6334339375836375, CurrSamplesPerSec=2.6262056628572124, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +[2024-11-01 17:55:37,575] [INFO] [logging.py:96:log_dist] [Rank 0] step=41710, skipped=0, lr=[5.988358272038343e-06], mom=[(0.9, 0.95)] +[2024-11-01 17:55:37,577] [INFO] [timer.py:259:stop] epoch=0/micro_step=41710/global_step=41710, RunningAvgSamplesPerSec=2.633433630895238, CurrSamplesPerSec=2.632644289629314, MemAllocated=16.43GB, MaxMemAllocated=37.96GB +Saving final model...Saving final model... + +Saving final model in PyTorch bin format... +wandb: 🚀 View run devilish-veil-470 at: https://wandb.ai/organization-chanho/Shared Memory/runs/agjc19ic +wandb: Find logs at: wandb/run-20241101_001357-agjc19ic/logs +[2024-11-01 17:55:54,949] [INFO] [launch.py:351:main] Process 2252703 exits successfully. +wandb: 🚀 View run sinister-spell-471 at: https://wandb.ai/organization-chanho/Shared Memory/runs/aws5932s +wandb: Find logs at: wandb/run-20241101_001357-aws5932s/logs +[2024-11-01 17:56:23,956] [INFO] [launch.py:351:main] Process 2252702 exits successfully.