diff --git "a/wandb/run-20210709_144100-2k1kyrq2/files/output.log" "b/wandb/run-20210709_144100-2k1kyrq2/files/output.log" deleted file mode 100644--- "a/wandb/run-20210709_144100-2k1kyrq2/files/output.log" +++ /dev/null @@ -1,428 +0,0 @@ -INFO:__main__:Training/evaluation parameters TrainingArguments( -_n_gpu=0, -adafactor=False, -adam_beta1=0.9, -adam_beta2=0.98, -adam_epsilon=1e-08, -dataloader_drop_last=False, -dataloader_num_workers=64, -dataloader_pin_memory=True, -ddp_find_unused_parameters=None, -debug=[], -deepspeed=None, -disable_tqdm=False, -do_eval=True, -do_predict=False, -do_train=True, -eval_accumulation_steps=None, -eval_steps=5000, -evaluation_strategy=IntervalStrategy.NO, -fp16=False, -fp16_backend=auto, -fp16_full_eval=False, -fp16_opt_level=O1, -gradient_accumulation_steps=1, -greater_is_better=None, -group_by_length=False, -ignore_data_skip=False, -label_names=None, -label_smoothing_factor=0.0, -learning_rate=0.0024, -length_column_name=length, -load_best_model_at_end=False, -local_rank=-1, -log_level=-1, -log_level_replica=-1, -log_on_each_node=True, -logging_dir=/home/cahya/Work/flax-community/gpt2-medium-indonesian/runs/Jul09_14-41-04_t1v-n-528d9406-w-0, -logging_first_step=False, -logging_steps=5000, -logging_strategy=IntervalStrategy.STEPS, -lr_scheduler_type=SchedulerType.LINEAR, -max_grad_norm=1.0, -max_steps=-1, -metric_for_best_model=None, -mp_parameters=, -no_cuda=False, -num_train_epochs=20.0, -output_dir=/home/cahya/Work/flax-community/gpt2-medium-indonesian, -overwrite_output_dir=True, -past_index=-1, -per_device_eval_batch_size=24, -per_device_train_batch_size=24, -prediction_loss_only=False, -push_to_hub=True, -push_to_hub_model_id=gpt2-medium-indonesian, -push_to_hub_organization=None, -push_to_hub_token=None, -remove_unused_columns=True, -report_to=['tensorboard', 'wandb'], -resume_from_checkpoint=None, -run_name=/home/cahya/Work/flax-community/gpt2-medium-indonesian, -save_on_each_node=False, -save_steps=5000, -save_strategy=IntervalStrategy.STEPS, -save_total_limit=None, -seed=42, -sharded_ddp=[], -skip_memory_metrics=True, -tpu_metrics_debug=False, -tpu_num_cores=None, -use_legacy_prediction_loop=False, -warmup_ratio=0.0, -warmup_steps=1000, -weight_decay=0.01, -) -WARNING:datasets.builder:Reusing dataset oscar (/home/cahya/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_id/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2) -WARNING:datasets.builder:Reusing dataset oscar (/home/cahya/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_id/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2) -WARNING:datasets.builder:Reusing dataset oscar (/home/cahya/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_id/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2) -loading configuration file /home/cahya/Work/flax-community/gpt2-medium-indonesian/config.json -Model config GPT2Config { - "activation_function": "gelu_new", - "architectures": [ - "GPT2LMHeadModel" - ], - "attn_pdrop": 0.0, - "bos_token_id": 50256, - "embd_pdrop": 0.0, - "eos_token_id": 50256, - "gradient_checkpointing": false, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 1024, - "n_head": 16, - "n_inner": null, - "n_layer": 24, - "n_positions": 1024, - "n_special": 0, - "predict_special_tokens": true, - "resid_pdrop": 0.0, - "scale_attn_weights": true, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - 
"summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 - } - }, - "transformers_version": "4.9.0.dev0", - "use_cache": true, - "vocab_size": 50257 -} -Could not locate the tokenizer configuration file, will try to use the model config instead. -loading configuration file /home/cahya/Work/flax-community/gpt2-medium-indonesian/config.json -Model config GPT2Config { - "activation_function": "gelu_new", - "architectures": [ - "GPT2LMHeadModel" - ], - "attn_pdrop": 0.0, - "bos_token_id": 50256, - "embd_pdrop": 0.0, - "eos_token_id": 50256, - "gradient_checkpointing": false, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 1024, - "n_head": 16, - "n_inner": null, - "n_layer": 24, - "n_positions": 1024, - "n_special": 0, - "predict_special_tokens": true, - "resid_pdrop": 0.0, - "scale_attn_weights": true, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 - } - }, - "transformers_version": "4.9.0.dev0", - "use_cache": true, - "vocab_size": 50257 -} -Didn't find file /home/cahya/Work/flax-community/gpt2-medium-indonesian/vocab.json. We won't load it. -Didn't find file /home/cahya/Work/flax-community/gpt2-medium-indonesian/merges.txt. We won't load it. -Didn't find file /home/cahya/Work/flax-community/gpt2-medium-indonesian/added_tokens.json. We won't load it. -Didn't find file /home/cahya/Work/flax-community/gpt2-medium-indonesian/special_tokens_map.json. We won't load it. -Didn't find file /home/cahya/Work/flax-community/gpt2-medium-indonesian/tokenizer_config.json. We won't load it. -loading file None -loading file None -loading file /home/cahya/Work/flax-community/gpt2-medium-indonesian/tokenizer.json -loading file None -loading file None -loading file None - #0: 0%| | 0/153 [00:00