BrownianNotion commited on
Commit
93cf0ea
·
verified ·
1 Parent(s): 38f35b1

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20250408_142458-wlfced8t/run-wlfced8t.wandb filter=lfs diff=lfs merge=lfs -text
metrics.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "PPL": 16.7308349609375,
3
+ "arc_challenge": {
4
+ "acc": 0.2150170648464164,
5
+ "acc_stderr": 0.012005717634133614,
6
+ "acc_norm": 0.26109215017064846,
7
+ "acc_norm_stderr": 0.012835523909473867
8
+ },
9
+ "arc_easy": {
10
+ "acc": 0.37836700336700335,
11
+ "acc_stderr": 0.00995157568333195,
12
+ "acc_norm": 0.3611111111111111,
13
+ "acc_norm_stderr": 0.009856013425811242
14
+ },
15
+ "hellaswag": {
16
+ "acc": 0.33519219279028084,
17
+ "acc_stderr": 0.00471092856998574,
18
+ "acc_norm": 0.391256721768572,
19
+ "acc_norm_stderr": 0.004870342592915051
20
+ },
21
+ "piqa": {
22
+ "acc": 0.6142546245919478,
23
+ "acc_stderr": 0.011357166777524042,
24
+ "acc_norm": 0.6196953210010882,
25
+ "acc_norm_stderr": 0.011326620892570317
26
+ },
27
+ "winogrande": {
28
+ "acc": 0.4988161010260458,
29
+ "acc_stderr": 0.014052446290529019
30
+ },
31
+ "QA Avg": 0.4083293973243388
32
+ }
runs/events.out.tfevents.1744122340.816cf5acb821 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9d2dfc3ed992e856282258efd7e8191ae0bd3e81fd476fcf1e436f0fee2237e
3
+ size 70125
wandb/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-08T14:24:58.122459682Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:24:58.380784982Z","level":"INFO","msg":"created new stream","id":"wlfced8t"}
3
+ {"time":"2025-04-08T14:24:58.380854043Z","level":"INFO","msg":"stream: started","id":"wlfced8t"}
4
+ {"time":"2025-04-08T14:24:58.380871954Z","level":"INFO","msg":"writer: Do: started","stream_id":"wlfced8t"}
5
+ {"time":"2025-04-08T14:24:58.380921165Z","level":"INFO","msg":"sender: started","stream_id":"wlfced8t"}
6
+ {"time":"2025-04-08T14:24:58.380954676Z","level":"INFO","msg":"handler: started","stream_id":"wlfced8t"}
7
+ {"time":"2025-04-08T14:24:58.59144642Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-04-08T15:51:25.126053606Z","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-04-08T15:51:25.126119508Z","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-04-08T15:51:25.715478158Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-04-08T15:51:25.950965444Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2025-04-08T15:51:25.956097933Z","level":"INFO","msg":"stream: closing","id":"wlfced8t"}
13
+ {"time":"2025-04-08T15:51:25.956113153Z","level":"INFO","msg":"handler: closed","stream_id":"wlfced8t"}
14
+ {"time":"2025-04-08T15:51:25.956122893Z","level":"INFO","msg":"writer: Close: closed","stream_id":"wlfced8t"}
15
+ {"time":"2025-04-08T15:51:25.956130383Z","level":"INFO","msg":"sender: closed","stream_id":"wlfced8t"}
16
+ {"time":"2025-04-08T15:51:25.956237656Z","level":"INFO","msg":"stream: closed","id":"wlfced8t"}
wandb/debug.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Configure stats pid to 9420
3
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug.log
7
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug-internal.log
8
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:24:58,119 INFO MainThread:9420 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:24:58,119 INFO MainThread:9420 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:24:58,124 INFO MainThread:9420 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:24:58,304 INFO MainThread:9420 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
17
+ 2025-04-08 14:24:58,589 INFO MainThread:9420 [wandb_init.py:init():1014] starting run threads in backend
18
+ 2025-04-08 14:24:58,680 INFO MainThread:9420 [wandb_run.py:_console_start():2454] atexit reg
19
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2306] redirect: wrap_raw
20
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2371] Wrapping output streams.
21
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2394] Redirects installed.
22
+ 2025-04-08 14:24:58,682 INFO MainThread:9420 [wandb_init.py:init():1056] run started, returning control to user process
23
+ 2025-04-08 14:25:40,038 INFO MainThread:9420 [wandb_run.py:_config_callback():1327] config_cb None None {'vocab_size': 32001, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5632, 'num_hidden_layers': 22, 'num_attention_heads': 32, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '../models/TinyLlama_v1.1/', 'transformers_version': '4.37.0', 'model_type': 'llama', 'output_dir': './ckpts/tinyllama_v1.1/int2-g128/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 4.0, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './ckpts/tinyllama_v1.1/int2-g128/runs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 40, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 40, 'dataloader_num_workers': 0, 'past_index': -1, 
'run_name': './ckpts/tinyllama_v1.1/int2-g128/', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': 'config/zero.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'cache_dir': None, 'model_max_length': 1024, 'bits': 2, 'q_group_size': 128, 'quant_type': 'int2-asym', 'clip': '../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt', 'train_kd': True, 'kd_tmp': 1, 'kd_loss_type': 'cakld', 'cakld_steps': 10}
24
+ 2025-04-08 15:51:25,124 INFO MainThread:9420 [wandb_run.py:_finish():2189] finishing run DeepFriedNLP/SNLP_BitDistiller/wlfced8t
25
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_atexit_cleanup():2419] got exitcode: 0
26
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_restore():2401] restore
27
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_restore():2407] restore done
28
+ 2025-04-08 15:51:25,953 INFO MainThread:9420 [wandb_run.py:_footer_history_summary_info():4064] rendering history
29
+ 2025-04-08 15:51:25,954 INFO MainThread:9420 [wandb_run.py:_footer_history_summary_info():4096] rendering summary
30
+ 2025-04-08 15:51:25,955 INFO MainThread:9420 [wandb_run.py:_footer_sync_info():4025] logging synced files
wandb/run-20250408_141746-8llb18c8/files/output.log ADDED
File without changes
wandb/run-20250408_141746-8llb18c8/logs/debug-core.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-08T14:17:45.949818845Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpp31dfewy/port-6976.txt","pid":6976,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-08T14:17:45.966044774Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":6976}
3
+ {"time":"2025-04-08T14:17:45.966061155Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":46761,"Zone":""}}
4
+ {"time":"2025-04-08T14:17:46.138422649Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:53806"}
5
+ {"time":"2025-04-08T14:17:46.530704428Z","level":"INFO","msg":"handleInformInit: received","streamId":"8llb18c8","id":"127.0.0.1:53806"}
6
+ {"time":"2025-04-08T14:17:46.790763608Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"8llb18c8","id":"127.0.0.1:53806"}
7
+ {"time":"2025-04-08T14:17:47.911560658Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:53806"}
8
+ {"time":"2025-04-08T14:17:47.91163659Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:53806"}
9
+ {"time":"2025-04-08T14:17:47.911668421Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2025-04-08T14:17:47.911737362Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:53806"}
11
+ {"time":"2025-04-08T14:17:48.092584004Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:53806"}
12
+ {"time":"2025-04-08T14:17:48.092971643Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:53806"}
13
+ {"time":"2025-04-08T14:17:48.092993873Z","level":"INFO","msg":"server is closed"}
wandb/run-20250408_141746-8llb18c8/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-08T14:17:46.530930213Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_141746-8llb18c8/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:17:46.790699927Z","level":"INFO","msg":"created new stream","id":"8llb18c8"}
3
+ {"time":"2025-04-08T14:17:46.790754738Z","level":"INFO","msg":"stream: started","id":"8llb18c8"}
4
+ {"time":"2025-04-08T14:17:46.790793839Z","level":"INFO","msg":"writer: Do: started","stream_id":"8llb18c8"}
5
+ {"time":"2025-04-08T14:17:46.790821339Z","level":"INFO","msg":"sender: started","stream_id":"8llb18c8"}
6
+ {"time":"2025-04-08T14:17:46.790876061Z","level":"INFO","msg":"handler: started","stream_id":"8llb18c8"}
7
+ {"time":"2025-04-08T14:17:46.909228673Z","level":"ERROR","msg":"HTTP error","status":401,"method":"POST","url":"https://api.wandb.ai/graphql"}
8
+ {"time":"2025-04-08T14:17:46.909315955Z","level":"ERROR","msg":"sender: upsertRun:","error":"failed to upsert bucket: returned error 401: {\"data\":{\"upsertBucket\":null},\"errors\":[{\"message\":\"user is not logged in\",\"path\":[\"upsertBucket\"],\"extensions\":{\"code\":\"PERMISSION_ERROR\"}}]}"}
9
+ {"time":"2025-04-08T14:17:47.911709251Z","level":"INFO","msg":"stream: closing","id":"8llb18c8"}
10
+ {"time":"2025-04-08T14:17:47.911812024Z","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
11
+ {"time":"2025-04-08T14:17:48.090388056Z","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"}
12
+ {"time":"2025-04-08T14:17:48.090496798Z","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404: {\"data\":{\"createRunFiles\":null},\"errors\":[{\"message\":\"run SNLP_BitDistiller/8llb18c8 not found during createRunFiles\",\"path\":[\"createRunFiles\"]}]}"}
13
+ {"time":"2025-04-08T14:17:48.091068171Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
14
+ {"time":"2025-04-08T14:17:48.091107982Z","level":"INFO","msg":"handler: closed","stream_id":"8llb18c8"}
15
+ {"time":"2025-04-08T14:17:48.091119232Z","level":"INFO","msg":"writer: Close: closed","stream_id":"8llb18c8"}
16
+ {"time":"2025-04-08T14:17:48.091141302Z","level":"INFO","msg":"sender: closed","stream_id":"8llb18c8"}
17
+ {"time":"2025-04-08T14:17:48.091168643Z","level":"INFO","msg":"stream: closed","id":"8llb18c8"}
wandb/run-20250408_141746-8llb18c8/logs/debug.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_setup.py:_flush():67] Configure stats pid to 6976
3
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_141746-8llb18c8/logs/debug.log
7
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_141746-8llb18c8/logs/debug-internal.log
8
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:17:46,525 INFO MainThread:6976 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:17:46,528 INFO MainThread:6976 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:17:46,528 INFO MainThread:6976 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:17:46,533 INFO MainThread:6976 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:17:46,716 INFO MainThread:6976 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
wandb/run-20250408_141746-8llb18c8/run-8llb18c8.wandb ADDED
Binary file (637 Bytes). View file
 
wandb/run-20250408_142458-wlfced8t/files/config.yaml ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: ../models/TinyLlama_v1.1/
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.19.9
6
+ m:
7
+ - "1": eval/runtime
8
+ "5": 2
9
+ "6":
10
+ - 1
11
+ - 3
12
+ "7": []
13
+ - "1": train/global_step
14
+ "6":
15
+ - 3
16
+ "7": []
17
+ - "1": eval/samples_per_second
18
+ "5": 2
19
+ "6":
20
+ - 1
21
+ - 3
22
+ "7": []
23
+ - "1": eval/steps_per_second
24
+ "5": 2
25
+ "6":
26
+ - 1
27
+ - 3
28
+ "7": []
29
+ - "1": train/train_steps_per_second
30
+ "5": 2
31
+ "6":
32
+ - 1
33
+ - 3
34
+ "7": []
35
+ - "1": train/train_runtime
36
+ "5": 2
37
+ "6":
38
+ - 1
39
+ - 3
40
+ "7": []
41
+ - "1": train/train_samples_per_second
42
+ "5": 2
43
+ "6":
44
+ - 1
45
+ - 3
46
+ "7": []
47
+ - "1": train/loss
48
+ "5": 2
49
+ "6":
50
+ - 1
51
+ - 3
52
+ "7": []
53
+ - "1": train/epoch
54
+ "5": 2
55
+ "6":
56
+ - 1
57
+ - 3
58
+ "7": []
59
+ - "1": eval/loss
60
+ "5": 2
61
+ "6":
62
+ - 1
63
+ - 3
64
+ "7": []
65
+ - "1": train/total_flos
66
+ "5": 2
67
+ "6":
68
+ - 1
69
+ - 3
70
+ "7": []
71
+ - "1": train/train_loss
72
+ "5": 2
73
+ "6":
74
+ - 1
75
+ - 3
76
+ "7": []
77
+ - "1": train/learning_rate
78
+ "5": 2
79
+ "6":
80
+ - 1
81
+ - 3
82
+ "7": []
83
+ python_version: 3.9.21
84
+ t:
85
+ "1":
86
+ - 1
87
+ - 5
88
+ - 11
89
+ - 49
90
+ - 51
91
+ - 53
92
+ - 55
93
+ - 71
94
+ - 98
95
+ "2":
96
+ - 1
97
+ - 5
98
+ - 11
99
+ - 49
100
+ - 51
101
+ - 53
102
+ - 55
103
+ - 71
104
+ - 98
105
+ "3":
106
+ - 2
107
+ - 7
108
+ - 13
109
+ - 15
110
+ - 23
111
+ - 55
112
+ - 66
113
+ "4": 3.9.21
114
+ "5": 0.19.9
115
+ "6": 4.37.0
116
+ "8":
117
+ - 5
118
+ "9":
119
+ "1": transformers_trainer
120
+ "12": 0.19.9
121
+ "13": linux-x86_64
122
+ adafactor:
123
+ value: false
124
+ adam_beta1:
125
+ value: 0.9
126
+ adam_beta2:
127
+ value: 0.999
128
+ adam_epsilon:
129
+ value: 1e-08
130
+ add_cross_attention:
131
+ value: false
132
+ architectures:
133
+ value:
134
+ - LlamaForCausalLM
135
+ attention_bias:
136
+ value: false
137
+ attention_dropout:
138
+ value: 0
139
+ auto_find_batch_size:
140
+ value: false
141
+ bad_words_ids:
142
+ value: null
143
+ begin_suppress_tokens:
144
+ value: null
145
+ bf16:
146
+ value: true
147
+ bf16_full_eval:
148
+ value: false
149
+ bits:
150
+ value: 2
151
+ bos_token_id:
152
+ value: 1
153
+ cache_dir:
154
+ value: null
155
+ cakld_steps:
156
+ value: 10
157
+ chunk_size_feed_forward:
158
+ value: 0
159
+ clip:
160
+ value: ../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt
161
+ cross_attention_hidden_size:
162
+ value: null
163
+ data_seed:
164
+ value: null
165
+ dataloader_drop_last:
166
+ value: false
167
+ dataloader_num_workers:
168
+ value: 0
169
+ dataloader_persistent_workers:
170
+ value: false
171
+ dataloader_pin_memory:
172
+ value: true
173
+ ddp_backend:
174
+ value: null
175
+ ddp_broadcast_buffers:
176
+ value: null
177
+ ddp_bucket_cap_mb:
178
+ value: null
179
+ ddp_find_unused_parameters:
180
+ value: null
181
+ ddp_timeout:
182
+ value: 1800
183
+ debug:
184
+ value: []
185
+ decoder_start_token_id:
186
+ value: null
187
+ deepspeed:
188
+ value: config/zero.json
189
+ disable_tqdm:
190
+ value: false
191
+ dispatch_batches:
192
+ value: null
193
+ diversity_penalty:
194
+ value: 0
195
+ do_eval:
196
+ value: true
197
+ do_predict:
198
+ value: false
199
+ do_sample:
200
+ value: false
201
+ do_train:
202
+ value: false
203
+ early_stopping:
204
+ value: false
205
+ encoder_no_repeat_ngram_size:
206
+ value: 0
207
+ eos_token_id:
208
+ value: 2
209
+ eval_accumulation_steps:
210
+ value: null
211
+ eval_delay:
212
+ value: 0
213
+ eval_steps:
214
+ value: 40
215
+ evaluation_strategy:
216
+ value: steps
217
+ exponential_decay_length_penalty:
218
+ value: null
219
+ finetuning_task:
220
+ value: null
221
+ forced_bos_token_id:
222
+ value: null
223
+ forced_eos_token_id:
224
+ value: null
225
+ fp16:
226
+ value: false
227
+ fp16_backend:
228
+ value: auto
229
+ fp16_full_eval:
230
+ value: false
231
+ fp16_opt_level:
232
+ value: O1
233
+ fsdp:
234
+ value: []
235
+ fsdp_config:
236
+ value:
237
+ min_num_params: 0
238
+ xla: false
239
+ xla_fsdp_grad_ckpt: false
240
+ fsdp_min_num_params:
241
+ value: 0
242
+ fsdp_transformer_layer_cls_to_wrap:
243
+ value: null
244
+ full_determinism:
245
+ value: false
246
+ gradient_accumulation_steps:
247
+ value: 4
248
+ gradient_checkpointing:
249
+ value: true
250
+ gradient_checkpointing_kwargs:
251
+ value: null
252
+ greater_is_better:
253
+ value: false
254
+ group_by_length:
255
+ value: false
256
+ half_precision_backend:
257
+ value: auto
258
+ hidden_act:
259
+ value: silu
260
+ hidden_size:
261
+ value: 2048
262
+ hub_always_push:
263
+ value: false
264
+ hub_model_id:
265
+ value: null
266
+ hub_private_repo:
267
+ value: false
268
+ hub_strategy:
269
+ value: every_save
270
+ hub_token:
271
+ value: <HUB_TOKEN>
272
+ id2label:
273
+ value:
274
+ "0": LABEL_0
275
+ "1": LABEL_1
276
+ ignore_data_skip:
277
+ value: false
278
+ include_inputs_for_metrics:
279
+ value: false
280
+ include_num_input_tokens_seen:
281
+ value: false
282
+ include_tokens_per_second:
283
+ value: false
284
+ initializer_range:
285
+ value: 0.02
286
+ intermediate_size:
287
+ value: 5632
288
+ is_decoder:
289
+ value: false
290
+ is_encoder_decoder:
291
+ value: false
292
+ jit_mode_eval:
293
+ value: false
294
+ kd_loss_type:
295
+ value: cakld
296
+ kd_tmp:
297
+ value: 1
298
+ label_names:
299
+ value: null
300
+ label_smoothing_factor:
301
+ value: 0
302
+ label2id:
303
+ value:
304
+ LABEL_0: 0
305
+ LABEL_1: 1
306
+ learning_rate:
307
+ value: 2e-05
308
+ length_column_name:
309
+ value: length
310
+ length_penalty:
311
+ value: 1
312
+ load_best_model_at_end:
313
+ value: true
314
+ local_rank:
315
+ value: 0
316
+ log_level:
317
+ value: passive
318
+ log_level_replica:
319
+ value: warning
320
+ log_on_each_node:
321
+ value: true
322
+ logging_dir:
323
+ value: ./ckpts/tinyllama_v1.1/int2-g128/runs/
324
+ logging_first_step:
325
+ value: false
326
+ logging_nan_inf_filter:
327
+ value: true
328
+ logging_steps:
329
+ value: 1
330
+ logging_strategy:
331
+ value: steps
332
+ lr_scheduler_type:
333
+ value: constant
334
+ max_grad_norm:
335
+ value: 1
336
+ max_length:
337
+ value: 20
338
+ max_position_embeddings:
339
+ value: 2048
340
+ max_steps:
341
+ value: -1
342
+ metric_for_best_model:
343
+ value: loss
344
+ min_length:
345
+ value: 0
346
+ model_max_length:
347
+ value: 1024
348
+ model_type:
349
+ value: llama
350
+ mp_parameters:
351
+ value: ""
352
+ neftune_noise_alpha:
353
+ value: null
354
+ no_cuda:
355
+ value: false
356
+ no_repeat_ngram_size:
357
+ value: 0
358
+ num_attention_heads:
359
+ value: 32
360
+ num_beam_groups:
361
+ value: 1
362
+ num_beams:
363
+ value: 1
364
+ num_hidden_layers:
365
+ value: 22
366
+ num_key_value_heads:
367
+ value: 4
368
+ num_return_sequences:
369
+ value: 1
370
+ num_train_epochs:
371
+ value: 4
372
+ optim:
373
+ value: adamw_torch
374
+ optim_args:
375
+ value: null
376
+ output_attentions:
377
+ value: false
378
+ output_dir:
379
+ value: ./ckpts/tinyllama_v1.1/int2-g128/
380
+ output_hidden_states:
381
+ value: false
382
+ output_scores:
383
+ value: false
384
+ overwrite_output_dir:
385
+ value: false
386
+ pad_token_id:
387
+ value: null
388
+ past_index:
389
+ value: -1
390
+ per_device_eval_batch_size:
391
+ value: 16
392
+ per_device_train_batch_size:
393
+ value: 16
394
+ per_gpu_eval_batch_size:
395
+ value: null
396
+ per_gpu_train_batch_size:
397
+ value: null
398
+ prediction_loss_only:
399
+ value: false
400
+ prefix:
401
+ value: null
402
+ pretraining_tp:
403
+ value: 1
404
+ problem_type:
405
+ value: null
406
+ push_to_hub:
407
+ value: false
408
+ push_to_hub_model_id:
409
+ value: null
410
+ push_to_hub_organization:
411
+ value: null
412
+ push_to_hub_token:
413
+ value: <PUSH_TO_HUB_TOKEN>
414
+ q_group_size:
415
+ value: 128
416
+ quant_type:
417
+ value: int2-asym
418
+ ray_scope:
419
+ value: last
420
+ remove_invalid_values:
421
+ value: false
422
+ remove_unused_columns:
423
+ value: true
424
+ repetition_penalty:
425
+ value: 1
426
+ report_to:
427
+ value:
428
+ - tensorboard
429
+ - wandb
430
+ resume_from_checkpoint:
431
+ value: null
432
+ return_dict:
433
+ value: true
434
+ return_dict_in_generate:
435
+ value: false
436
+ rms_norm_eps:
437
+ value: 1e-05
438
+ rope_scaling:
439
+ value: null
440
+ rope_theta:
441
+ value: 10000
442
+ run_name:
443
+ value: ./ckpts/tinyllama_v1.1/int2-g128/
444
+ save_on_each_node:
445
+ value: false
446
+ save_only_model:
447
+ value: false
448
+ save_safetensors:
449
+ value: true
450
+ save_steps:
451
+ value: 40
452
+ save_strategy:
453
+ value: steps
454
+ save_total_limit:
455
+ value: 2
456
+ seed:
457
+ value: 42
458
+ sep_token_id:
459
+ value: null
460
+ skip_memory_metrics:
461
+ value: true
462
+ split_batches:
463
+ value: false
464
+ suppress_tokens:
465
+ value: null
466
+ task_specific_params:
467
+ value: null
468
+ temperature:
469
+ value: 1
470
+ tf_legacy_loss:
471
+ value: false
472
+ tf32:
473
+ value: null
474
+ tie_encoder_decoder:
475
+ value: false
476
+ tie_word_embeddings:
477
+ value: false
478
+ tokenizer_class:
479
+ value: null
480
+ top_k:
481
+ value: 50
482
+ top_p:
483
+ value: 1
484
+ torch_compile:
485
+ value: false
486
+ torch_compile_backend:
487
+ value: null
488
+ torch_compile_mode:
489
+ value: null
490
+ torch_dtype:
491
+ value: bfloat16
492
+ torchdynamo:
493
+ value: null
494
+ torchscript:
495
+ value: false
496
+ tpu_metrics_debug:
497
+ value: false
498
+ tpu_num_cores:
499
+ value: null
500
+ train_kd:
501
+ value: true
502
+ transformers_version:
503
+ value: 4.37.0
504
+ typical_p:
505
+ value: 1
506
+ use_bfloat16:
507
+ value: false
508
+ use_cache:
509
+ value: true
510
+ use_cpu:
511
+ value: false
512
+ use_ipex:
513
+ value: false
514
+ use_legacy_prediction_loop:
515
+ value: false
516
+ use_mps_device:
517
+ value: false
518
+ vocab_size:
519
+ value: 32001
520
+ warmup_ratio:
521
+ value: 0
522
+ warmup_steps:
523
+ value: 0
524
+ weight_decay:
525
+ value: 0
wandb/run-20250408_142458-wlfced8t/files/output.log ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches']). Please pass an `accelerate.DataLoaderConfiguration` instead:
2
+ dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
3
+ warnings.warn(
4
+ Using /root/.cache/torch_extensions/py39_cu124 as PyTorch extensions root...
5
+ Creating extension directory /root/.cache/torch_extensions/py39_cu124/cpu_adam...
6
+ Emitting ninja build file /root/.cache/torch_extensions/py39_cu124/cpu_adam/build.ninja...
7
+ Building extension module cpu_adam...
8
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
9
+ Loading extension module cpu_adam...
10
+ Time to load cpu_adam op: 23.6294584274292 seconds
11
+ [2025-04-08 14:25:40,026] [WARNING] [lr_schedules.py:683:get_lr] Attempting to get learning rate from scheduler before it has started
12
+ 0%| | 0/400 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
13
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
14
+ return fn(*args, **kwargs)
15
+
16
+ {'loss': 5217.7002, 'learning_rate': 0.0, 'epoch': 0.01}
17
+ {'loss': 5011.7588, 'learning_rate': 2e-05, 'epoch': 0.02}
18
+ {'loss': 4812.8281, 'learning_rate': 2e-05, 'epoch': 0.03}
19
+ {'loss': 1860.1274, 'learning_rate': 2e-05, 'epoch': 0.04}
20
+ {'loss': 1713.5734, 'learning_rate': 2e-05, 'epoch': 0.05}
21
+ {'loss': 1103.2999, 'learning_rate': 2e-05, 'epoch': 0.06}
22
+ {'loss': 675.6068, 'learning_rate': 2e-05, 'epoch': 0.07}
23
+ {'loss': 683.2965, 'learning_rate': 2e-05, 'epoch': 0.08}
24
+ {'loss': 734.9794, 'learning_rate': 2e-05, 'epoch': 0.09}
25
+ {'loss': 585.422, 'learning_rate': 2e-05, 'epoch': 0.1}
26
+ {'loss': 570.1306, 'learning_rate': 2e-05, 'epoch': 0.11}
27
+ {'loss': 599.2966, 'learning_rate': 2e-05, 'epoch': 0.12}
28
+ {'loss': 574.2372, 'learning_rate': 2e-05, 'epoch': 0.13}
29
+ {'loss': 481.7502, 'learning_rate': 2e-05, 'epoch': 0.14}
30
+ {'loss': 531.738, 'learning_rate': 2e-05, 'epoch': 0.15}
31
+ {'loss': 448.2928, 'learning_rate': 2e-05, 'epoch': 0.16}
32
+ {'loss': 490.8231, 'learning_rate': 2e-05, 'epoch': 0.17}
33
+ {'loss': 448.1083, 'learning_rate': 2e-05, 'epoch': 0.18}
34
+ {'loss': 506.6299, 'learning_rate': 2e-05, 'epoch': 0.19}
35
+ {'loss': 432.8912, 'learning_rate': 2e-05, 'epoch': 0.2}
36
+ {'loss': 449.625, 'learning_rate': 2e-05, 'epoch': 0.21}
37
+ {'loss': 588.6897, 'learning_rate': 2e-05, 'epoch': 0.22}
38
+ {'loss': 405.68, 'learning_rate': 2e-05, 'epoch': 0.23}
39
+ {'loss': 516.0303, 'learning_rate': 2e-05, 'epoch': 0.24}
40
+ {'loss': 401.7353, 'learning_rate': 2e-05, 'epoch': 0.25}
41
+ {'loss': 399.9238, 'learning_rate': 2e-05, 'epoch': 0.26}
42
+ {'loss': 351.5388, 'learning_rate': 2e-05, 'epoch': 0.27}
43
+ {'loss': 401.7846, 'learning_rate': 2e-05, 'epoch': 0.28}
44
+ {'loss': 385.5488, 'learning_rate': 2e-05, 'epoch': 0.29}
45
+ {'loss': 378.391, 'learning_rate': 2e-05, 'epoch': 0.3}
46
+ {'loss': 446.2048, 'learning_rate': 2e-05, 'epoch': 0.31}
47
+ {'loss': 354.5841, 'learning_rate': 2e-05, 'epoch': 0.32}
48
+ {'loss': 370.7014, 'learning_rate': 2e-05, 'epoch': 0.33}
49
+ {'loss': 397.8215, 'learning_rate': 2e-05, 'epoch': 0.34}
50
+ {'loss': 400.33, 'learning_rate': 2e-05, 'epoch': 0.35}
51
+ {'loss': 379.8638, 'learning_rate': 2e-05, 'epoch': 0.36}
52
+ {'loss': 282.1729, 'learning_rate': 2e-05, 'epoch': 0.37}
53
+ {'loss': 288.713, 'learning_rate': 2e-05, 'epoch': 0.38}
54
+ {'loss': 362.3285, 'learning_rate': 2e-05, 'epoch': 0.39}
55
+ {'loss': 345.252, 'learning_rate': 2e-05, 'epoch': 0.4}
56
+ return fn(*args, **kwargs)
57
+ {'eval_loss': 349.94061279296875, 'eval_runtime': 91.4064, 'eval_samples_per_second': 17.504, 'eval_steps_per_second': 1.094, 'epoch': 0.4}
58
+
59
+ {'loss': 391.5078, 'learning_rate': 2e-05, 'epoch': 0.41}
60
+ {'loss': 332.1484, 'learning_rate': 2e-05, 'epoch': 0.42}
61
+ {'loss': 352.9743, 'learning_rate': 2e-05, 'epoch': 0.43}
62
+ {'loss': 314.8037, 'learning_rate': 2e-05, 'epoch': 0.44}
63
+ {'loss': 386.3977, 'learning_rate': 2e-05, 'epoch': 0.45}
64
+ {'loss': 359.9244, 'learning_rate': 2e-05, 'epoch': 0.46}
65
+ {'loss': 376.9478, 'learning_rate': 2e-05, 'epoch': 0.47}
66
+ {'loss': 307.694, 'learning_rate': 2e-05, 'epoch': 0.48}
67
+ {'loss': 359.4525, 'learning_rate': 2e-05, 'epoch': 0.49}
68
+ {'loss': 319.51, 'learning_rate': 2e-05, 'epoch': 0.5}
69
+ {'loss': 349.2659, 'learning_rate': 2e-05, 'epoch': 0.51}
70
+ {'loss': 332.9238, 'learning_rate': 2e-05, 'epoch': 0.52}
71
+ {'loss': 324.871, 'learning_rate': 2e-05, 'epoch': 0.53}
72
+ {'loss': 305.993, 'learning_rate': 2e-05, 'epoch': 0.54}
73
+ {'loss': 334.1832, 'learning_rate': 2e-05, 'epoch': 0.55}
74
+ {'loss': 393.5037, 'learning_rate': 2e-05, 'epoch': 0.56}
75
+ {'loss': 453.1027, 'learning_rate': 2e-05, 'epoch': 0.57}
76
+ {'loss': 306.5744, 'learning_rate': 2e-05, 'epoch': 0.58}
77
+ {'loss': 343.3282, 'learning_rate': 2e-05, 'epoch': 0.59}
78
+ {'loss': 367.3992, 'learning_rate': 2e-05, 'epoch': 0.6}
79
+ {'loss': 252.5841, 'learning_rate': 2e-05, 'epoch': 0.61}
80
+ {'loss': 332.6815, 'learning_rate': 2e-05, 'epoch': 0.62}
81
+ {'loss': 260.7815, 'learning_rate': 2e-05, 'epoch': 0.63}
82
+ {'loss': 322.11, 'learning_rate': 2e-05, 'epoch': 0.64}
83
+ {'loss': 316.3943, 'learning_rate': 2e-05, 'epoch': 0.65}
84
+ {'loss': 282.9459, 'learning_rate': 2e-05, 'epoch': 0.66}
85
+ {'loss': 389.4407, 'learning_rate': 2e-05, 'epoch': 0.67}
86
+ {'loss': 266.971, 'learning_rate': 2e-05, 'epoch': 0.68}
87
+ {'loss': 309.2886, 'learning_rate': 2e-05, 'epoch': 0.69}
88
+ {'loss': 248.2584, 'learning_rate': 2e-05, 'epoch': 0.7}
89
+ {'loss': 311.3903, 'learning_rate': 2e-05, 'epoch': 0.71}
90
+ {'loss': 281.4355, 'learning_rate': 2e-05, 'epoch': 0.72}
91
+ {'loss': 348.8126, 'learning_rate': 2e-05, 'epoch': 0.73}
92
+ {'loss': 335.4451, 'learning_rate': 2e-05, 'epoch': 0.74}
93
+ {'loss': 252.2587, 'learning_rate': 2e-05, 'epoch': 0.75}
94
+ {'loss': 280.6372, 'learning_rate': 2e-05, 'epoch': 0.76}
95
+ {'loss': 276.1567, 'learning_rate': 2e-05, 'epoch': 0.77}
96
+ {'loss': 295.8489, 'learning_rate': 2e-05, 'epoch': 0.78}
97
+ {'loss': 258.2193, 'learning_rate': 2e-05, 'epoch': 0.79}
98
+ {'loss': 303.292, 'learning_rate': 2e-05, 'epoch': 0.8}
99
+ return fn(*args, **kwargs)
100
+ {'eval_loss': 290.92047119140625, 'eval_runtime': 91.4002, 'eval_samples_per_second': 17.505, 'eval_steps_per_second': 1.094, 'epoch': 0.8}
101
+
102
+ {'loss': 292.2969, 'learning_rate': 2e-05, 'epoch': 0.81}
103
+ {'loss': 292.5847, 'learning_rate': 2e-05, 'epoch': 0.82}
104
+ {'loss': 287.2512, 'learning_rate': 2e-05, 'epoch': 0.83}
105
+ {'loss': 274.7219, 'learning_rate': 2e-05, 'epoch': 0.84}
106
+ {'loss': 259.4895, 'learning_rate': 2e-05, 'epoch': 0.85}
107
+ {'loss': 308.7735, 'learning_rate': 2e-05, 'epoch': 0.86}
108
+ {'loss': 358.5125, 'learning_rate': 2e-05, 'epoch': 0.87}
109
+ {'loss': 228.2059, 'learning_rate': 2e-05, 'epoch': 0.88}
110
+ {'loss': 221.5, 'learning_rate': 2e-05, 'epoch': 0.89}
111
+ {'loss': 256.0698, 'learning_rate': 2e-05, 'epoch': 0.9}
112
+ {'loss': 280.6437, 'learning_rate': 2e-05, 'epoch': 0.91}
113
+ {'loss': 265.0312, 'learning_rate': 2e-05, 'epoch': 0.92}
114
+ {'loss': 276.1062, 'learning_rate': 2e-05, 'epoch': 0.93}
115
+ {'loss': 332.605, 'learning_rate': 2e-05, 'epoch': 0.94}
116
+ {'loss': 241.6841, 'learning_rate': 2e-05, 'epoch': 0.95}
117
+ {'loss': 290.7553, 'learning_rate': 2e-05, 'epoch': 0.96}
118
+ {'loss': 242.8853, 'learning_rate': 2e-05, 'epoch': 0.97}
119
+ {'loss': 267.5484, 'learning_rate': 2e-05, 'epoch': 0.98}
120
+ {'loss': 263.2162, 'learning_rate': 2e-05, 'epoch': 0.99}
121
+ {'loss': 226.8806, 'learning_rate': 2e-05, 'epoch': 1.0}
122
+ {'loss': 175.9632, 'learning_rate': 2e-05, 'epoch': 1.01}
123
+ {'loss': 293.8885, 'learning_rate': 2e-05, 'epoch': 1.02}
124
+ {'loss': 264.4063, 'learning_rate': 2e-05, 'epoch': 1.03}
125
+ {'loss': 268.8441, 'learning_rate': 2e-05, 'epoch': 1.04}
126
+ {'loss': 251.1409, 'learning_rate': 2e-05, 'epoch': 1.05}
127
+ {'loss': 222.4796, 'learning_rate': 2e-05, 'epoch': 1.06}
128
+ {'loss': 259.7393, 'learning_rate': 2e-05, 'epoch': 1.07}
129
+ {'loss': 247.0995, 'learning_rate': 2e-05, 'epoch': 1.08}
130
+ {'loss': 228.6188, 'learning_rate': 2e-05, 'epoch': 1.09}
131
+ {'loss': 238.4029, 'learning_rate': 2e-05, 'epoch': 1.1}
132
+ {'loss': 249.7835, 'learning_rate': 2e-05, 'epoch': 1.11}
133
+ {'loss': 255.0745, 'learning_rate': 2e-05, 'epoch': 1.12}
134
+ {'loss': 281.3386, 'learning_rate': 2e-05, 'epoch': 1.13}
135
+ {'loss': 258.1128, 'learning_rate': 2e-05, 'epoch': 1.14}
136
+ {'loss': 258.487, 'learning_rate': 2e-05, 'epoch': 1.15}
137
+ {'loss': 252.1913, 'learning_rate': 2e-05, 'epoch': 1.16}
138
+ {'loss': 222.6366, 'learning_rate': 2e-05, 'epoch': 1.17}
139
+ {'loss': 247.7612, 'learning_rate': 2e-05, 'epoch': 1.18}
140
+ {'loss': 212.2664, 'learning_rate': 2e-05, 'epoch': 1.19}
141
+ {'loss': 260.1885, 'learning_rate': 2e-05, 'epoch': 1.2}
142
+ return fn(*args, **kwargs)
143
+ {'eval_loss': 268.6912841796875, 'eval_runtime': 91.4506, 'eval_samples_per_second': 17.496, 'eval_steps_per_second': 1.093, 'epoch': 1.2}
144
+
145
+ {'loss': 225.2303, 'learning_rate': 2e-05, 'epoch': 1.21}
146
+ {'loss': 208.434, 'learning_rate': 2e-05, 'epoch': 1.22}
147
+ {'loss': 260.5778, 'learning_rate': 2e-05, 'epoch': 1.23}
148
+ {'loss': 256.1859, 'learning_rate': 2e-05, 'epoch': 1.24}
149
+ {'loss': 225.3799, 'learning_rate': 2e-05, 'epoch': 1.25}
150
+ {'loss': 242.6659, 'learning_rate': 2e-05, 'epoch': 1.26}
151
+ {'loss': 218.2521, 'learning_rate': 2e-05, 'epoch': 1.27}
152
+ {'loss': 237.711, 'learning_rate': 2e-05, 'epoch': 1.28}
153
+ {'loss': 228.6392, 'learning_rate': 2e-05, 'epoch': 1.29}
154
+ {'loss': 257.0567, 'learning_rate': 2e-05, 'epoch': 1.3}
155
+ {'loss': 225.4318, 'learning_rate': 2e-05, 'epoch': 1.31}
156
+ {'loss': 255.6358, 'learning_rate': 2e-05, 'epoch': 1.32}
157
+ {'loss': 243.6262, 'learning_rate': 2e-05, 'epoch': 1.33}
158
+ {'loss': 235.9305, 'learning_rate': 2e-05, 'epoch': 1.34}
159
+ {'loss': 238.0324, 'learning_rate': 2e-05, 'epoch': 1.35}
160
+ {'loss': 239.2688, 'learning_rate': 2e-05, 'epoch': 1.36}
161
+ {'loss': 234.8799, 'learning_rate': 2e-05, 'epoch': 1.37}
162
+ {'loss': 249.6847, 'learning_rate': 2e-05, 'epoch': 1.38}
163
+ {'loss': 259.0303, 'learning_rate': 2e-05, 'epoch': 1.39}
164
+ {'loss': 230.0663, 'learning_rate': 2e-05, 'epoch': 1.4}
165
+ {'loss': 312.8887, 'learning_rate': 2e-05, 'epoch': 1.41}
166
+ {'loss': 214.6919, 'learning_rate': 2e-05, 'epoch': 1.42}
167
+ {'loss': 204.0403, 'learning_rate': 2e-05, 'epoch': 1.43}
168
+ {'loss': 219.8406, 'learning_rate': 2e-05, 'epoch': 1.44}
169
+ {'loss': 229.476, 'learning_rate': 2e-05, 'epoch': 1.45}
170
+ {'loss': 222.8145, 'learning_rate': 2e-05, 'epoch': 1.46}
171
+ {'loss': 257.3806, 'learning_rate': 2e-05, 'epoch': 1.47}
172
+ {'loss': 206.661, 'learning_rate': 2e-05, 'epoch': 1.48}
173
+ {'loss': 244.2539, 'learning_rate': 2e-05, 'epoch': 1.49}
174
+ {'loss': 219.9999, 'learning_rate': 2e-05, 'epoch': 1.5}
175
+ {'loss': 186.9665, 'learning_rate': 2e-05, 'epoch': 1.51}
176
+ {'loss': 246.9571, 'learning_rate': 2e-05, 'epoch': 1.52}
177
+ {'loss': 296.5907, 'learning_rate': 2e-05, 'epoch': 1.53}
178
+ {'loss': 235.987, 'learning_rate': 2e-05, 'epoch': 1.54}
179
+ {'loss': 232.2841, 'learning_rate': 2e-05, 'epoch': 1.55}
180
+ {'loss': 257.2687, 'learning_rate': 2e-05, 'epoch': 1.56}
181
+ {'loss': 229.2959, 'learning_rate': 2e-05, 'epoch': 1.57}
182
+ {'loss': 204.7547, 'learning_rate': 2e-05, 'epoch': 1.58}
183
+ {'loss': 229.0461, 'learning_rate': 2e-05, 'epoch': 1.59}
184
+ {'loss': 208.6121, 'learning_rate': 2e-05, 'epoch': 1.6}
185
+ return fn(*args, **kwargs)
186
+ {'eval_loss': 251.8457794189453, 'eval_runtime': 91.4542, 'eval_samples_per_second': 17.495, 'eval_steps_per_second': 1.093, 'epoch': 1.6}
187
+
188
+ {'loss': 213.7581, 'learning_rate': 2e-05, 'epoch': 1.61}
189
+ {'loss': 250.1387, 'learning_rate': 2e-05, 'epoch': 1.62}
190
+ {'loss': 216.0783, 'learning_rate': 2e-05, 'epoch': 1.63}
191
+ {'loss': 223.1108, 'learning_rate': 2e-05, 'epoch': 1.64}
192
+ {'loss': 223.4337, 'learning_rate': 2e-05, 'epoch': 1.65}
193
+ {'loss': 216.0298, 'learning_rate': 2e-05, 'epoch': 1.66}
194
+ {'loss': 210.1397, 'learning_rate': 2e-05, 'epoch': 1.67}
195
+ {'loss': 255.6102, 'learning_rate': 2e-05, 'epoch': 1.68}
196
+ {'loss': 206.8196, 'learning_rate': 2e-05, 'epoch': 1.69}
197
+ {'loss': 225.3016, 'learning_rate': 2e-05, 'epoch': 1.7}
198
+ {'loss': 204.423, 'learning_rate': 2e-05, 'epoch': 1.71}
199
+ {'loss': 200.3793, 'learning_rate': 2e-05, 'epoch': 1.72}
200
+ {'loss': 254.3165, 'learning_rate': 2e-05, 'epoch': 1.73}
201
+ {'loss': 228.116, 'learning_rate': 2e-05, 'epoch': 1.74}
202
+ {'loss': 215.9781, 'learning_rate': 2e-05, 'epoch': 1.75}
203
+ {'loss': 240.427, 'learning_rate': 2e-05, 'epoch': 1.76}
204
+ {'loss': 285.4974, 'learning_rate': 2e-05, 'epoch': 1.77}
205
+ {'loss': 241.3725, 'learning_rate': 2e-05, 'epoch': 1.78}
206
+ {'loss': 208.2607, 'learning_rate': 2e-05, 'epoch': 1.79}
207
+ {'loss': 189.7236, 'learning_rate': 2e-05, 'epoch': 1.8}
208
+ {'loss': 251.2979, 'learning_rate': 2e-05, 'epoch': 1.81}
209
+ {'loss': 221.9034, 'learning_rate': 2e-05, 'epoch': 1.82}
210
+ {'loss': 212.9315, 'learning_rate': 2e-05, 'epoch': 1.83}
211
+ {'loss': 269.8028, 'learning_rate': 2e-05, 'epoch': 1.84}
212
+ {'loss': 234.7929, 'learning_rate': 2e-05, 'epoch': 1.85}
213
+ {'loss': 232.632, 'learning_rate': 2e-05, 'epoch': 1.86}
214
+ {'loss': 234.5318, 'learning_rate': 2e-05, 'epoch': 1.87}
215
+ {'loss': 217.2522, 'learning_rate': 2e-05, 'epoch': 1.88}
216
+ {'loss': 212.185, 'learning_rate': 2e-05, 'epoch': 1.89}
217
+ {'loss': 193.634, 'learning_rate': 2e-05, 'epoch': 1.9}
218
+ {'loss': 194.4389, 'learning_rate': 2e-05, 'epoch': 1.91}
219
+ {'loss': 239.7684, 'learning_rate': 2e-05, 'epoch': 1.92}
220
+ {'loss': 214.6728, 'learning_rate': 2e-05, 'epoch': 1.93}
221
+ {'loss': 238.8546, 'learning_rate': 2e-05, 'epoch': 1.94}
222
+ {'loss': 252.4766, 'learning_rate': 2e-05, 'epoch': 1.95}
223
+ {'loss': 213.9979, 'learning_rate': 2e-05, 'epoch': 1.96}
224
+ {'loss': 226.8183, 'learning_rate': 2e-05, 'epoch': 1.97}
225
+ {'loss': 178.3002, 'learning_rate': 2e-05, 'epoch': 1.98}
226
+ {'loss': 226.1548, 'learning_rate': 2e-05, 'epoch': 1.99}
227
+ {'loss': 197.3228, 'learning_rate': 2e-05, 'epoch': 2.0}
228
+ return fn(*args, **kwargs)
229
+ {'eval_loss': 234.64894104003906, 'eval_runtime': 91.475, 'eval_samples_per_second': 17.491, 'eval_steps_per_second': 1.093, 'epoch': 2.0}
230
+
231
+ {'loss': 177.7395, 'learning_rate': 2e-05, 'epoch': 2.01}
232
+ {'loss': 207.6397, 'learning_rate': 2e-05, 'epoch': 2.02}
233
+ {'loss': 234.7, 'learning_rate': 2e-05, 'epoch': 2.03}
234
+ {'loss': 161.9427, 'learning_rate': 2e-05, 'epoch': 2.04}
235
+ {'loss': 152.5065, 'learning_rate': 2e-05, 'epoch': 2.05}
236
+ {'loss': 209.7177, 'learning_rate': 2e-05, 'epoch': 2.06}
237
+ {'loss': 180.4935, 'learning_rate': 2e-05, 'epoch': 2.07}
238
+ {'loss': 189.8789, 'learning_rate': 2e-05, 'epoch': 2.08}
239
+ {'loss': 179.2588, 'learning_rate': 2e-05, 'epoch': 2.09}
240
+ {'loss': 199.4266, 'learning_rate': 2e-05, 'epoch': 2.1}
241
+ {'loss': 217.2014, 'learning_rate': 2e-05, 'epoch': 2.11}
242
+ {'loss': 194.8955, 'learning_rate': 2e-05, 'epoch': 2.12}
243
+ {'loss': 199.1684, 'learning_rate': 2e-05, 'epoch': 2.13}
244
+ {'loss': 184.8403, 'learning_rate': 2e-05, 'epoch': 2.14}
245
+ {'loss': 181.7052, 'learning_rate': 2e-05, 'epoch': 2.15}
246
+ {'loss': 178.4005, 'learning_rate': 2e-05, 'epoch': 2.16}
247
+ {'loss': 166.0668, 'learning_rate': 2e-05, 'epoch': 2.17}
248
+ {'loss': 200.5868, 'learning_rate': 2e-05, 'epoch': 2.18}
249
+ {'loss': 164.0996, 'learning_rate': 2e-05, 'epoch': 2.19}
250
+ {'loss': 215.2086, 'learning_rate': 2e-05, 'epoch': 2.2}
251
+ {'loss': 182.4766, 'learning_rate': 2e-05, 'epoch': 2.21}
252
+ {'loss': 265.9225, 'learning_rate': 2e-05, 'epoch': 2.22}
253
+ {'loss': 170.2314, 'learning_rate': 2e-05, 'epoch': 2.23}
254
+ {'loss': 168.8915, 'learning_rate': 2e-05, 'epoch': 2.24}
255
+ {'loss': 202.6422, 'learning_rate': 2e-05, 'epoch': 2.25}
256
+ {'loss': 193.6124, 'learning_rate': 2e-05, 'epoch': 2.26}
257
+ {'loss': 191.7759, 'learning_rate': 2e-05, 'epoch': 2.27}
258
+ {'loss': 172.5128, 'learning_rate': 2e-05, 'epoch': 2.28}
259
+ {'loss': 174.589, 'learning_rate': 2e-05, 'epoch': 2.29}
260
+ {'loss': 190.2146, 'learning_rate': 2e-05, 'epoch': 2.3}
261
+ {'loss': 206.5455, 'learning_rate': 2e-05, 'epoch': 2.31}
262
+ {'loss': 212.3613, 'learning_rate': 2e-05, 'epoch': 2.32}
263
+ {'loss': 196.8155, 'learning_rate': 2e-05, 'epoch': 2.33}
264
+ {'loss': 175.7169, 'learning_rate': 2e-05, 'epoch': 2.34}
265
+ {'loss': 246.1433, 'learning_rate': 2e-05, 'epoch': 2.35}
266
+ {'loss': 273.7065, 'learning_rate': 2e-05, 'epoch': 2.36}
267
+ {'loss': 158.33, 'learning_rate': 2e-05, 'epoch': 2.37}
268
+ {'loss': 159.6902, 'learning_rate': 2e-05, 'epoch': 2.38}
269
+ {'loss': 260.6693, 'learning_rate': 2e-05, 'epoch': 2.39}
270
+ {'loss': 191.4345, 'learning_rate': 2e-05, 'epoch': 2.4}
271
+ return fn(*args, **kwargs)
272
+ {'eval_loss': 228.75088500976562, 'eval_runtime': 91.7222, 'eval_samples_per_second': 17.444, 'eval_steps_per_second': 1.09, 'epoch': 2.4}
273
+
274
+ {'loss': 194.6978, 'learning_rate': 2e-05, 'epoch': 2.41}
275
+ {'loss': 170.113, 'learning_rate': 2e-05, 'epoch': 2.42}
276
+ {'loss': 206.9311, 'learning_rate': 2e-05, 'epoch': 2.43}
277
+ {'loss': 195.2319, 'learning_rate': 2e-05, 'epoch': 2.44}
278
+ {'loss': 163.4541, 'learning_rate': 2e-05, 'epoch': 2.45}
279
+ {'loss': 194.2114, 'learning_rate': 2e-05, 'epoch': 2.46}
280
+ {'loss': 204.1492, 'learning_rate': 2e-05, 'epoch': 2.47}
281
+ {'loss': 202.1168, 'learning_rate': 2e-05, 'epoch': 2.48}
282
+ {'loss': 188.9232, 'learning_rate': 2e-05, 'epoch': 2.49}
283
+ {'loss': 183.1904, 'learning_rate': 2e-05, 'epoch': 2.5}
284
+ {'loss': 171.6944, 'learning_rate': 2e-05, 'epoch': 2.51}
285
+ {'loss': 218.1628, 'learning_rate': 2e-05, 'epoch': 2.52}
286
+ {'loss': 178.1614, 'learning_rate': 2e-05, 'epoch': 2.53}
287
+ {'loss': 175.8137, 'learning_rate': 2e-05, 'epoch': 2.54}
288
+ {'loss': 176.3016, 'learning_rate': 2e-05, 'epoch': 2.55}
289
+ {'loss': 195.611, 'learning_rate': 2e-05, 'epoch': 2.56}
290
+ {'loss': 154.6473, 'learning_rate': 2e-05, 'epoch': 2.57}
291
+ {'loss': 175.7625, 'learning_rate': 2e-05, 'epoch': 2.58}
292
+ {'loss': 180.9702, 'learning_rate': 2e-05, 'epoch': 2.59}
293
+ {'loss': 172.006, 'learning_rate': 2e-05, 'epoch': 2.6}
294
+ {'loss': 166.616, 'learning_rate': 2e-05, 'epoch': 2.61}
295
+ {'loss': 205.9087, 'learning_rate': 2e-05, 'epoch': 2.62}
296
+ {'loss': 195.5401, 'learning_rate': 2e-05, 'epoch': 2.63}
297
+ {'loss': 182.7327, 'learning_rate': 2e-05, 'epoch': 2.64}
298
+ {'loss': 187.6268, 'learning_rate': 2e-05, 'epoch': 2.65}
299
+ {'loss': 150.9506, 'learning_rate': 2e-05, 'epoch': 2.66}
300
+ {'loss': 187.1612, 'learning_rate': 2e-05, 'epoch': 2.67}
301
+ {'loss': 199.4861, 'learning_rate': 2e-05, 'epoch': 2.68}
302
+ {'loss': 197.6736, 'learning_rate': 2e-05, 'epoch': 2.69}
303
+ {'loss': 204.7334, 'learning_rate': 2e-05, 'epoch': 2.7}
304
+ {'loss': 186.2923, 'learning_rate': 2e-05, 'epoch': 2.71}
305
+ {'loss': 191.4558, 'learning_rate': 2e-05, 'epoch': 2.72}
306
+ {'loss': 195.1405, 'learning_rate': 2e-05, 'epoch': 2.73}
307
+ {'loss': 193.3551, 'learning_rate': 2e-05, 'epoch': 2.74}
308
+ {'loss': 191.0934, 'learning_rate': 2e-05, 'epoch': 2.75}
309
+ {'loss': 181.389, 'learning_rate': 2e-05, 'epoch': 2.76}
310
+ {'loss': 175.3716, 'learning_rate': 2e-05, 'epoch': 2.77}
311
+ {'loss': 172.3194, 'learning_rate': 2e-05, 'epoch': 2.78}
312
+ {'loss': 210.0355, 'learning_rate': 2e-05, 'epoch': 2.79}
313
+ {'loss': 151.5427, 'learning_rate': 2e-05, 'epoch': 2.8}
314
+ return fn(*args, **kwargs)
315
+ {'eval_loss': 220.8135528564453, 'eval_runtime': 91.7827, 'eval_samples_per_second': 17.432, 'eval_steps_per_second': 1.09, 'epoch': 2.8}
316
+
317
+ {'loss': 216.8114, 'learning_rate': 2e-05, 'epoch': 2.81}
318
+ {'loss': 204.5617, 'learning_rate': 2e-05, 'epoch': 2.82}
319
+ {'loss': 170.5889, 'learning_rate': 2e-05, 'epoch': 2.83}
320
+ {'loss': 207.3868, 'learning_rate': 2e-05, 'epoch': 2.84}
321
+ {'loss': 181.0243, 'learning_rate': 2e-05, 'epoch': 2.85}
322
+ {'loss': 181.3605, 'learning_rate': 2e-05, 'epoch': 2.86}
323
+ {'loss': 151.9068, 'learning_rate': 2e-05, 'epoch': 2.87}
324
+ {'loss': 181.1088, 'learning_rate': 2e-05, 'epoch': 2.88}
325
+ {'loss': 168.5044, 'learning_rate': 2e-05, 'epoch': 2.89}
326
+ {'loss': 169.9193, 'learning_rate': 2e-05, 'epoch': 2.9}
327
+ {'loss': 163.9978, 'learning_rate': 2e-05, 'epoch': 2.91}
328
+ {'loss': 160.5624, 'learning_rate': 2e-05, 'epoch': 2.92}
329
+ {'loss': 168.6609, 'learning_rate': 2e-05, 'epoch': 2.93}
330
+ {'loss': 176.1334, 'learning_rate': 2e-05, 'epoch': 2.94}
331
+ {'loss': 183.0596, 'learning_rate': 2e-05, 'epoch': 2.95}
332
+ {'loss': 137.6645, 'learning_rate': 2e-05, 'epoch': 2.96}
333
+ {'loss': 207.355, 'learning_rate': 2e-05, 'epoch': 2.97}
334
+ {'loss': 142.0311, 'learning_rate': 2e-05, 'epoch': 2.98}
335
+ {'loss': 220.0689, 'learning_rate': 2e-05, 'epoch': 2.99}
336
+ {'loss': 188.8789, 'learning_rate': 2e-05, 'epoch': 3.0}
337
+ {'loss': 125.5, 'learning_rate': 2e-05, 'epoch': 3.01}
338
+ {'loss': 135.3729, 'learning_rate': 2e-05, 'epoch': 3.02}
339
+ {'loss': 138.8647, 'learning_rate': 2e-05, 'epoch': 3.03}
340
+ {'loss': 179.7612, 'learning_rate': 2e-05, 'epoch': 3.04}
341
+ {'loss': 155.7191, 'learning_rate': 2e-05, 'epoch': 3.05}
342
+ {'loss': 142.4713, 'learning_rate': 2e-05, 'epoch': 3.06}
343
+ {'loss': 153.5572, 'learning_rate': 2e-05, 'epoch': 3.07}
344
+ {'loss': 158.0588, 'learning_rate': 2e-05, 'epoch': 3.08}
345
+ {'loss': 188.1761, 'learning_rate': 2e-05, 'epoch': 3.09}
346
+ {'loss': 149.4881, 'learning_rate': 2e-05, 'epoch': 3.1}
347
+ {'loss': 143.047, 'learning_rate': 2e-05, 'epoch': 3.11}
348
+ {'loss': 149.3641, 'learning_rate': 2e-05, 'epoch': 3.12}
349
+ {'loss': 157.5219, 'learning_rate': 2e-05, 'epoch': 3.13}
350
+ {'loss': 160.7546, 'learning_rate': 2e-05, 'epoch': 3.14}
351
+ {'loss': 204.2928, 'learning_rate': 2e-05, 'epoch': 3.15}
352
+ {'loss': 162.4894, 'learning_rate': 2e-05, 'epoch': 3.16}
353
+ {'loss': 142.8077, 'learning_rate': 2e-05, 'epoch': 3.17}
354
+ {'loss': 189.9061, 'learning_rate': 2e-05, 'epoch': 3.18}
355
+ {'loss': 160.648, 'learning_rate': 2e-05, 'epoch': 3.19}
356
+ {'loss': 195.4514, 'learning_rate': 2e-05, 'epoch': 3.2}
357
+ return fn(*args, **kwargs)
358
+ {'eval_loss': 214.12860107421875, 'eval_runtime': 91.6917, 'eval_samples_per_second': 17.45, 'eval_steps_per_second': 1.091, 'epoch': 3.2}
359
+
360
+ {'loss': 165.5777, 'learning_rate': 2e-05, 'epoch': 3.21}
361
+ {'loss': 157.4871, 'learning_rate': 2e-05, 'epoch': 3.22}
362
+ {'loss': 156.9734, 'learning_rate': 2e-05, 'epoch': 3.23}
363
+ {'loss': 171.0179, 'learning_rate': 2e-05, 'epoch': 3.24}
364
+ {'loss': 158.758, 'learning_rate': 2e-05, 'epoch': 3.25}
365
+ {'loss': 180.9356, 'learning_rate': 2e-05, 'epoch': 3.26}
366
+ {'loss': 132.6712, 'learning_rate': 2e-05, 'epoch': 3.27}
367
+ {'loss': 142.0085, 'learning_rate': 2e-05, 'epoch': 3.28}
368
+ {'loss': 122.8798, 'learning_rate': 2e-05, 'epoch': 3.29}
369
+ {'loss': 166.3639, 'learning_rate': 2e-05, 'epoch': 3.3}
370
+ {'loss': 126.7612, 'learning_rate': 2e-05, 'epoch': 3.31}
371
+ {'loss': 188.9171, 'learning_rate': 2e-05, 'epoch': 3.32}
372
+ {'loss': 151.0345, 'learning_rate': 2e-05, 'epoch': 3.33}
373
+ {'loss': 159.7177, 'learning_rate': 2e-05, 'epoch': 3.34}
374
+ {'loss': 150.0818, 'learning_rate': 2e-05, 'epoch': 3.35}
375
+ {'loss': 162.671, 'learning_rate': 2e-05, 'epoch': 3.36}
376
+ {'loss': 129.0101, 'learning_rate': 2e-05, 'epoch': 3.37}
377
+ {'loss': 187.8155, 'learning_rate': 2e-05, 'epoch': 3.38}
378
+ {'loss': 136.3083, 'learning_rate': 2e-05, 'epoch': 3.39}
379
+ {'loss': 139.6308, 'learning_rate': 2e-05, 'epoch': 3.4}
380
+ {'loss': 158.1506, 'learning_rate': 2e-05, 'epoch': 3.41}
381
+ {'loss': 152.7077, 'learning_rate': 2e-05, 'epoch': 3.42}
382
+ {'loss': 173.8158, 'learning_rate': 2e-05, 'epoch': 3.43}
383
+ {'loss': 128.805, 'learning_rate': 2e-05, 'epoch': 3.44}
384
+ {'loss': 152.3222, 'learning_rate': 2e-05, 'epoch': 3.45}
385
+ {'loss': 163.3487, 'learning_rate': 2e-05, 'epoch': 3.46}
386
+ {'loss': 169.0825, 'learning_rate': 2e-05, 'epoch': 3.47}
387
+ {'loss': 156.5232, 'learning_rate': 2e-05, 'epoch': 3.48}
388
+ {'loss': 188.6721, 'learning_rate': 2e-05, 'epoch': 3.49}
389
+ {'loss': 201.6223, 'learning_rate': 2e-05, 'epoch': 3.5}
390
+ {'loss': 294.7936, 'learning_rate': 2e-05, 'epoch': 3.51}
391
+ {'loss': 155.2639, 'learning_rate': 2e-05, 'epoch': 3.52}
392
+ {'loss': 148.6182, 'learning_rate': 2e-05, 'epoch': 3.53}
393
+ {'loss': 207.8028, 'learning_rate': 2e-05, 'epoch': 3.54}
394
+ {'loss': 163.1711, 'learning_rate': 2e-05, 'epoch': 3.55}
395
+ {'loss': 162.5552, 'learning_rate': 2e-05, 'epoch': 3.56}
396
+ {'loss': 167.8712, 'learning_rate': 2e-05, 'epoch': 3.57}
397
+ {'loss': 155.6208, 'learning_rate': 2e-05, 'epoch': 3.58}
398
+ {'loss': 178.2028, 'learning_rate': 2e-05, 'epoch': 3.59}
399
+ {'loss': 174.9905, 'learning_rate': 2e-05, 'epoch': 3.6}
400
+ return fn(*args, **kwargs)
401
+ {'eval_loss': 211.39349365234375, 'eval_runtime': 91.4666, 'eval_samples_per_second': 17.493, 'eval_steps_per_second': 1.093, 'epoch': 3.6}
402
+
403
+ {'loss': 149.9229, 'learning_rate': 2e-05, 'epoch': 3.61}
404
+ {'loss': 166.3071, 'learning_rate': 2e-05, 'epoch': 3.62}
405
+ {'loss': 124.036, 'learning_rate': 2e-05, 'epoch': 3.63}
406
+ {'loss': 172.7505, 'learning_rate': 2e-05, 'epoch': 3.64}
407
+ {'loss': 148.0122, 'learning_rate': 2e-05, 'epoch': 3.65}
408
+ {'loss': 185.0846, 'learning_rate': 2e-05, 'epoch': 3.66}
409
+ {'loss': 153.4166, 'learning_rate': 2e-05, 'epoch': 3.67}
410
+ {'loss': 144.2338, 'learning_rate': 2e-05, 'epoch': 3.68}
411
+ {'loss': 158.5771, 'learning_rate': 2e-05, 'epoch': 3.69}
412
+ {'loss': 163.8886, 'learning_rate': 2e-05, 'epoch': 3.7}
413
+ {'loss': 151.2742, 'learning_rate': 2e-05, 'epoch': 3.71}
414
+ {'loss': 169.2691, 'learning_rate': 2e-05, 'epoch': 3.72}
415
+ {'loss': 125.0493, 'learning_rate': 2e-05, 'epoch': 3.73}
416
+ {'loss': 144.3527, 'learning_rate': 2e-05, 'epoch': 3.74}
417
+ {'loss': 210.2006, 'learning_rate': 2e-05, 'epoch': 3.75}
418
+ {'loss': 162.8882, 'learning_rate': 2e-05, 'epoch': 3.76}
419
+ {'loss': 163.0425, 'learning_rate': 2e-05, 'epoch': 3.77}
420
+ {'loss': 144.6404, 'learning_rate': 2e-05, 'epoch': 3.78}
421
+ {'loss': 169.7259, 'learning_rate': 2e-05, 'epoch': 3.79}
422
+ {'loss': 117.309, 'learning_rate': 2e-05, 'epoch': 3.8}
423
+ {'loss': 179.2435, 'learning_rate': 2e-05, 'epoch': 3.81}
424
+ {'loss': 156.4202, 'learning_rate': 2e-05, 'epoch': 3.82}
425
+ {'loss': 212.8055, 'learning_rate': 2e-05, 'epoch': 3.83}
426
+ {'loss': 130.1424, 'learning_rate': 2e-05, 'epoch': 3.84}
427
+ {'loss': 143.6542, 'learning_rate': 2e-05, 'epoch': 3.85}
428
+ {'loss': 193.6444, 'learning_rate': 2e-05, 'epoch': 3.86}
429
+ {'loss': 176.6723, 'learning_rate': 2e-05, 'epoch': 3.87}
430
+ {'loss': 150.6032, 'learning_rate': 2e-05, 'epoch': 3.88}
431
+ {'loss': 146.1843, 'learning_rate': 2e-05, 'epoch': 3.89}
432
+ {'loss': 152.6586, 'learning_rate': 2e-05, 'epoch': 3.9}
433
+ {'loss': 162.0343, 'learning_rate': 2e-05, 'epoch': 3.91}
434
+ {'loss': 157.9043, 'learning_rate': 2e-05, 'epoch': 3.92}
435
+ {'loss': 140.6674, 'learning_rate': 2e-05, 'epoch': 3.93}
436
+ {'loss': 186.9754, 'learning_rate': 2e-05, 'epoch': 3.94}
437
+ {'loss': 157.4324, 'learning_rate': 2e-05, 'epoch': 3.95}
438
+ {'loss': 151.3968, 'learning_rate': 2e-05, 'epoch': 3.96}
439
+ {'loss': 182.1434, 'learning_rate': 2e-05, 'epoch': 3.97}
440
+ {'loss': 164.4491, 'learning_rate': 2e-05, 'epoch': 3.98}
441
+ {'loss': 151.253, 'learning_rate': 2e-05, 'epoch': 3.99}
442
+ {'loss': 136.0101, 'learning_rate': 2e-05, 'epoch': 4.0}
443
+
444
+ {'eval_loss': 206.38494873046875, 'eval_runtime': 91.5142, 'eval_samples_per_second': 17.484, 'eval_steps_per_second': 1.093, 'epoch': 4.0}
445
+ {'train_runtime': 5141.9779, 'train_samples_per_second': 4.979, 'train_steps_per_second': 0.078, 'train_loss': 280.0906865310669, 'epoch': 4.0}
wandb/run-20250408_142458-wlfced8t/files/requirements.txt ADDED
@@ -0,0 +1,142 @@
1
+ setuptools==58.1.0
2
+ pip==23.0.1
3
+ wcwidth==0.2.13
4
+ triton==3.2.0
5
+ sqlitedict==2.1.0
6
+ sentencepiece==0.2.0
7
+ pytz==2025.2
8
+ py-cpuinfo==9.0.0
9
+ pure_eval==0.2.3
10
+ ptyprocess==0.7.0
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ hjson==3.1.0
14
+ Fraction==2.2.0
15
+ antlr4-python3-runtime==4.9.3
16
+ zstandard==0.23.0
17
+ zipp==3.21.0
18
+ xxhash==3.5.0
19
+ urllib3==2.3.0
20
+ tzdata==2025.2
21
+ typing_extensions==4.13.1
22
+ traitlets==5.14.3
23
+ tqdm==4.67.1
24
+ tornado==6.4.2
25
+ threadpoolctl==3.6.0
26
+ tcolorpy==0.1.7
27
+ tabulate==0.9.0
28
+ sympy==1.13.1
29
+ smmap==5.0.2
30
+ six==1.17.0
31
+ setproctitle==1.3.5
32
+ safetensors==0.5.3
33
+ regex==2024.11.6
34
+ pyzmq==26.4.0
35
+ PyYAML==6.0.2
36
+ Pygments==2.19.1
37
+ pycountry==24.6.1
38
+ pyarrow==19.0.1
39
+ psutil==7.0.0
40
+ protobuf==5.29.4
41
+ propcache==0.3.1
42
+ prompt_toolkit==3.0.50
43
+ portalocker==3.1.1
44
+ platformdirs==4.3.7
45
+ pexpect==4.9.0
46
+ pathvalidate==3.2.3
47
+ parso==0.8.4
48
+ packaging==24.2
49
+ nvidia-nvtx-cu12==12.4.127
50
+ nvidia-nvjitlink-cu12==12.4.127
51
+ nvidia-nccl-cu12==2.21.5
52
+ nvidia-curand-cu12==10.3.5.147
53
+ nvidia-cufft-cu12==11.2.1.3
54
+ nvidia-cuda-runtime-cu12==12.4.127
55
+ nvidia-cuda-nvrtc-cu12==12.4.127
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ nvidia-cublas-cu12==12.4.5.8
58
+ numpy==2.0.2
59
+ ninja==1.11.1.4
60
+ networkx==3.2.1
61
+ nest-asyncio==1.6.0
62
+ msgpack==1.1.0
63
+ MarkupSafe==3.0.2
64
+ lxml==5.3.2
65
+ joblib==1.4.2
66
+ idna==3.10
67
+ fsspec==2024.12.0
68
+ frozenlist==1.5.0
69
+ filelock==3.18.0
70
+ executing==2.2.0
71
+ exceptiongroup==1.2.2
72
+ eval_type_backport==0.2.2
73
+ einops==0.8.1
74
+ dill==0.3.8
75
+ decorator==5.2.1
76
+ debugpy==1.8.13
77
+ colorama==0.4.6
78
+ click==8.1.8
79
+ charset-normalizer==3.4.1
80
+ chardet==5.2.0
81
+ certifi==2025.1.31
82
+ attrs==25.3.0
83
+ async-timeout==5.0.1
84
+ asttokens==3.0.0
85
+ annotated-types==0.7.0
86
+ aiohappyeyeballs==2.6.1
87
+ absl-py==2.2.2
88
+ typing-inspection==0.4.0
89
+ tqdm-multiprocess==0.0.11
90
+ tensorboardX==2.6.2.2
91
+ stack-data==0.6.3
92
+ sentry-sdk==2.25.1
93
+ scipy==1.13.1
94
+ sacrebleu==2.5.1
95
+ requests==2.32.3
96
+ python-dateutil==2.9.0.post0
97
+ pydantic_core==2.33.1
98
+ omegaconf==2.3.0
99
+ nvidia-cusparse-cu12==12.3.1.170
100
+ nvidia-cudnn-cu12==9.1.0.70
101
+ numexpr==2.10.2
102
+ nltk==3.9.1
103
+ multiprocess==0.70.16
104
+ multidict==6.3.2
105
+ mbstrdecoder==1.1.4
106
+ matplotlib-inline==0.1.7
107
+ jupyter_core==5.7.2
108
+ jsonlines==4.0.0
109
+ Jinja2==3.1.6
110
+ jedi==0.19.2
111
+ importlib_resources==6.5.2
112
+ importlib_metadata==8.6.1
113
+ gitdb==4.0.12
114
+ docker-pycreds==0.4.0
115
+ comm==0.2.2
116
+ aiosignal==1.3.2
117
+ yarl==1.19.0
118
+ typepy==1.3.4
119
+ scikit-learn==1.6.1
120
+ rouge-score==0.1.2
121
+ pydantic==2.11.3
122
+ pandas==2.2.3
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ jupyter_client==8.6.3
125
+ ipython==8.18.1
126
+ huggingface-hub==0.30.2
127
+ GitPython==3.1.44
128
+ torch==2.6.0
129
+ tokenizers==0.15.2
130
+ ipykernel==6.29.5
131
+ aiohttp==3.11.16
132
+ transformers==4.37.0
133
+ deepspeed==0.16.5
134
+ DataProperty==1.1.0
135
+ bitsandbytes==0.45.5
136
+ accelerate==0.28.0
137
+ tabledata==1.3.4
138
+ peft==0.8.0
139
+ datasets==3.5.0
140
+ pytablewriter==1.2.1
141
+ evaluate==0.4.3
142
+ wandb==0.19.9
wandb/run-20250408_142458-wlfced8t/files/wandb-metadata.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "os": "Linux-5.15.0-135-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.9.21",
4
+ "startedAt": "2025-04-08T14:24:58.120164Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "../models/TinyLlama_v1.1/",
9
+ "--data_path",
10
+ "../data/datasets/Llama-2-7b-hf/mix_wiki_alpaca_8000.json",
11
+ "--model_max_length",
12
+ "1024",
13
+ "--output_dir",
14
+ "./ckpts/tinyllama_v1.1/int2-g128/",
15
+ "--logging_dir",
16
+ "./ckpts/tinyllama_v1.1/int2-g128/runs/",
17
+ "--num_train_epochs",
18
+ "4",
19
+ "--bf16",
20
+ "True",
21
+ "--seed",
22
+ "42",
23
+ "--per_device_train_batch_size",
24
+ "16",
25
+ "--per_device_eval_batch_size",
26
+ "16",
27
+ "--gradient_accumulation_steps",
28
+ "4",
29
+ "--gradient_checkpointing",
30
+ "True",
31
+ "--evaluation_strategy",
32
+ "steps",
33
+ "--eval_steps",
34
+ "40",
35
+ "--load_best_model_at_end",
36
+ "True",
37
+ "--save_strategy",
38
+ "steps",
39
+ "--save_steps",
40
+ "40",
41
+ "--save_total_limit",
42
+ "2",
43
+ "--learning_rate",
44
+ "2e-5",
45
+ "--lr_scheduler_type",
46
+ "constant",
47
+ "--weight_decay",
48
+ "0.",
49
+ "--logging_steps",
50
+ "1",
51
+ "--report_to",
52
+ "tensorboard",
53
+ "wandb",
54
+ "--deepspeed",
55
+ "config/zero.json",
56
+ "--bits",
57
+ "2",
58
+ "--quant_type",
59
+ "int2-asym",
60
+ "--q_group_size",
61
+ "128",
62
+ "--train_kd",
63
+ "True",
64
+ "--kd_loss_type",
65
+ "cakld",
66
+ "--max_train_samples",
67
+ "999999",
68
+ "--clip",
69
+ "../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt"
70
+ ],
71
+ "program": "/workspace/BitDistiller/train/train.py",
72
+ "codePath": "train/train.py",
73
+ "git": {
74
+ "remote": "[email protected]:BrownianNotion/BitDistiller.git",
75
+ "commit": "17dbabd8bb62295551e4244147374632ce7a2287"
76
+ },
77
+ "email": "[email protected]",
78
+ "root": "./ckpts/tinyllama_v1.1/int2-g128/",
79
+ "host": "816cf5acb821",
80
+ "executable": "/workspace/BitDistiller/BitDistillerVenv/bin/python3.9",
81
+ "codePathLocal": "train.py",
82
+ "cpu_count": 32,
83
+ "cpu_count_logical": 64,
84
+ "gpu": "NVIDIA A100-PCIE-40GB",
85
+ "gpu_count": 1,
86
+ "disk": {
87
+ "/": {
88
+ "total": "213674622976",
89
+ "used": "15038046208"
90
+ }
91
+ },
92
+ "memory": {
93
+ "total": "270095581184"
94
+ },
95
+ "cpu": {
96
+ "count": 32,
97
+ "countLogical": 64
98
+ },
99
+ "gpu_nvidia": [
100
+ {
101
+ "name": "NVIDIA A100-PCIE-40GB",
102
+ "memoryTotal": "42949672960",
103
+ "cudaCores": 6912,
104
+ "architecture": "Ampere"
105
+ }
106
+ ],
107
+ "cudaVersion": "12.8"
108
+ }
wandb/run-20250408_142458-wlfced8t/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb":{"runtime":5187},"train/train_loss":280.0906865310669,"eval/steps_per_second":1.093,"eval/runtime":91.5142,"train/total_flos":1.627150507573248e+17,"eval/samples_per_second":17.484,"train/train_steps_per_second":0.078,"train/epoch":4,"train/global_step":400,"_runtime":5183.891265556,"train/learning_rate":2e-05,"_timestamp":1.7441274820110373e+09,"train/loss":136.0101,"eval/loss":206.38494873046875,"train/train_samples_per_second":4.979,"_step":410,"train/train_runtime":5141.9779}
wandb/run-20250408_142458-wlfced8t/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
1
+ {"time":"2025-04-08T14:24:57.609759289Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp7r_i2fib/port-9420.txt","pid":9420,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-08T14:24:57.626067579Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":9420}
3
+ {"time":"2025-04-08T14:24:57.626060178Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":33329,"Zone":""}}
4
+ {"time":"2025-04-08T14:24:57.799461315Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40764"}
5
+ {"time":"2025-04-08T14:24:58.122235867Z","level":"INFO","msg":"handleInformInit: received","streamId":"wlfced8t","id":"127.0.0.1:40764"}
6
+ {"time":"2025-04-08T14:24:58.380862144Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"wlfced8t","id":"127.0.0.1:40764"}
7
+ {"time":"2025-04-08T15:51:25.956066962Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"wlfced8t","id":"127.0.0.1:40764"}
8
+ {"time":"2025-04-08T15:51:25.956245656Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"wlfced8t","id":"127.0.0.1:40764"}
9
+ {"time":"2025-04-08T15:51:26.954886538Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:40764"}
10
+ {"time":"2025-04-08T15:51:26.954913569Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:40764"}
11
+ {"time":"2025-04-08T15:51:26.954921999Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-04-08T15:51:26.95495002Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:40764"}
13
+ {"time":"2025-04-08T15:51:26.955030402Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:40764"}
14
+ {"time":"2025-04-08T15:51:26.955048092Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:40764"}
15
+ {"time":"2025-04-08T15:51:26.955060042Z","level":"INFO","msg":"server is closed"}
wandb/run-20250408_142458-wlfced8t/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-08T14:24:58.122459682Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:24:58.380784982Z","level":"INFO","msg":"created new stream","id":"wlfced8t"}
3
+ {"time":"2025-04-08T14:24:58.380854043Z","level":"INFO","msg":"stream: started","id":"wlfced8t"}
4
+ {"time":"2025-04-08T14:24:58.380871954Z","level":"INFO","msg":"writer: Do: started","stream_id":"wlfced8t"}
5
+ {"time":"2025-04-08T14:24:58.380921165Z","level":"INFO","msg":"sender: started","stream_id":"wlfced8t"}
6
+ {"time":"2025-04-08T14:24:58.380954676Z","level":"INFO","msg":"handler: started","stream_id":"wlfced8t"}
7
+ {"time":"2025-04-08T14:24:58.59144642Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-04-08T15:51:25.126053606Z","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-04-08T15:51:25.126119508Z","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-04-08T15:51:25.715478158Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-04-08T15:51:25.950965444Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2025-04-08T15:51:25.956097933Z","level":"INFO","msg":"stream: closing","id":"wlfced8t"}
13
+ {"time":"2025-04-08T15:51:25.956113153Z","level":"INFO","msg":"handler: closed","stream_id":"wlfced8t"}
14
+ {"time":"2025-04-08T15:51:25.956122893Z","level":"INFO","msg":"writer: Close: closed","stream_id":"wlfced8t"}
15
+ {"time":"2025-04-08T15:51:25.956130383Z","level":"INFO","msg":"sender: closed","stream_id":"wlfced8t"}
16
+ {"time":"2025-04-08T15:51:25.956237656Z","level":"INFO","msg":"stream: closed","id":"wlfced8t"}
wandb/run-20250408_142458-wlfced8t/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Configure stats pid to 9420
3
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:24:58,116 INFO MainThread:9420 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug.log
7
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_142458-wlfced8t/logs/debug-internal.log
8
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:24:58,117 INFO MainThread:9420 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:24:58,119 INFO MainThread:9420 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:24:58,119 INFO MainThread:9420 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:24:58,124 INFO MainThread:9420 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:24:58,304 INFO MainThread:9420 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
17
+ 2025-04-08 14:24:58,589 INFO MainThread:9420 [wandb_init.py:init():1014] starting run threads in backend
18
+ 2025-04-08 14:24:58,680 INFO MainThread:9420 [wandb_run.py:_console_start():2454] atexit reg
19
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2306] redirect: wrap_raw
20
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2371] Wrapping output streams.
21
+ 2025-04-08 14:24:58,681 INFO MainThread:9420 [wandb_run.py:_redirect():2394] Redirects installed.
22
+ 2025-04-08 14:24:58,682 INFO MainThread:9420 [wandb_init.py:init():1056] run started, returning control to user process
23
+ 2025-04-08 14:25:40,038 INFO MainThread:9420 [wandb_run.py:_config_callback():1327] config_cb None None {'vocab_size': 32001, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5632, 'num_hidden_layers': 22, 'num_attention_heads': 32, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '../models/TinyLlama_v1.1/', 'transformers_version': '4.37.0', 'model_type': 'llama', 'output_dir': './ckpts/tinyllama_v1.1/int2-g128/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 4.0, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './ckpts/tinyllama_v1.1/int2-g128/runs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 40, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 40, 'dataloader_num_workers': 0, 'past_index': -1, 
'run_name': './ckpts/tinyllama_v1.1/int2-g128/', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': 'config/zero.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'cache_dir': None, 'model_max_length': 1024, 'bits': 2, 'q_group_size': 128, 'quant_type': 'int2-asym', 'clip': '../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt', 'train_kd': True, 'kd_tmp': 1, 'kd_loss_type': 'cakld', 'cakld_steps': 10}
24
+ 2025-04-08 15:51:25,124 INFO MainThread:9420 [wandb_run.py:_finish():2189] finishing run DeepFriedNLP/SNLP_BitDistiller/wlfced8t
25
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_atexit_cleanup():2419] got exitcode: 0
26
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_restore():2401] restore
27
+ 2025-04-08 15:51:25,125 INFO MainThread:9420 [wandb_run.py:_restore():2407] restore done
28
+ 2025-04-08 15:51:25,953 INFO MainThread:9420 [wandb_run.py:_footer_history_summary_info():4064] rendering history
29
+ 2025-04-08 15:51:25,954 INFO MainThread:9420 [wandb_run.py:_footer_history_summary_info():4096] rendering summary
30
+ 2025-04-08 15:51:25,955 INFO MainThread:9420 [wandb_run.py:_footer_sync_info():4025] logging synced files
wandb/run-20250408_142458-wlfced8t/run-wlfced8t.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d1ddb4b228042072d2a447f925cc0f73ee98db6218e72b836880c8ee4381d4
3
+ size 1263539