Update README.md #2
by FINGU-AI · opened

README.md CHANGED

```diff
@@ -594,157 +594,6 @@ You can finetune this model on your own dataset.
 *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
 -->
 
-## Training Details
-
-### Training Hyperparameters
-#### Non-Default Hyperparameters
-
-- `eval_strategy`: steps
-- `per_device_eval_batch_size`: 4
-- `gradient_accumulation_steps`: 4
-- `learning_rate`: 2e-05
-- `lr_scheduler_type`: cosine
-- `warmup_ratio`: 0.1
-- `warmup_steps`: 5
-- `bf16`: True
-- `tf32`: True
-- `optim`: adamw_torch_fused
-- `gradient_checkpointing`: True
-- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
-- `batch_sampler`: no_duplicates
-
-#### All Hyperparameters
-<details><summary>Click to expand</summary>
-
-- `overwrite_output_dir`: False
-- `do_predict`: False
-- `eval_strategy`: steps
-- `prediction_loss_only`: True
-- `per_device_train_batch_size`: 8
-- `per_device_eval_batch_size`: 4
-- `per_gpu_train_batch_size`: None
-- `per_gpu_eval_batch_size`: None
-- `gradient_accumulation_steps`: 4
-- `eval_accumulation_steps`: None
-- `learning_rate`: 2e-05
-- `weight_decay`: 0.0
-- `adam_beta1`: 0.9
-- `adam_beta2`: 0.999
-- `adam_epsilon`: 1e-08
-- `max_grad_norm`: 1.0
-- `num_train_epochs`: 3
-- `max_steps`: -1
-- `lr_scheduler_type`: cosine
-- `lr_scheduler_kwargs`: {}
-- `warmup_ratio`: 0.1
-- `warmup_steps`: 5
-- `log_level`: passive
-- `log_level_replica`: warning
-- `log_on_each_node`: True
-- `logging_nan_inf_filter`: True
-- `save_safetensors`: True
-- `save_on_each_node`: False
-- `save_only_model`: False
-- `restore_callback_states_from_checkpoint`: False
-- `no_cuda`: False
-- `use_cpu`: False
-- `use_mps_device`: False
-- `seed`: 42
-- `data_seed`: None
-- `jit_mode_eval`: False
-- `use_ipex`: False
-- `bf16`: True
-- `fp16`: False
-- `fp16_opt_level`: O1
-- `half_precision_backend`: auto
-- `bf16_full_eval`: False
-- `fp16_full_eval`: False
-- `tf32`: True
-- `local_rank`: 0
-- `ddp_backend`: None
-- `tpu_num_cores`: None
-- `tpu_metrics_debug`: False
-- `debug`: []
-- `dataloader_drop_last`: True
-- `dataloader_num_workers`: 0
-- `dataloader_prefetch_factor`: None
-- `past_index`: -1
-- `disable_tqdm`: False
-- `remove_unused_columns`: True
-- `label_names`: None
-- `load_best_model_at_end`: False
-- `ignore_data_skip`: False
-- `fsdp`: []
-- `fsdp_min_num_params`: 0
-- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
-- `fsdp_transformer_layer_cls_to_wrap`: None
-- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
-- `deepspeed`: None
-- `label_smoothing_factor`: 0.0
-- `optim`: adamw_torch_fused
-- `optim_args`: None
-- `adafactor`: False
-- `group_by_length`: False
-- `length_column_name`: length
-- `ddp_find_unused_parameters`: None
-- `ddp_bucket_cap_mb`: None
-- `ddp_broadcast_buffers`: False
-- `dataloader_pin_memory`: True
-- `dataloader_persistent_workers`: False
-- `skip_memory_metrics`: True
-- `use_legacy_prediction_loop`: False
-- `push_to_hub`: False
-- `resume_from_checkpoint`: None
-- `hub_model_id`: None
-- `hub_strategy`: every_save
-- `hub_private_repo`: False
-- `hub_always_push`: False
-- `gradient_checkpointing`: True
-- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
-- `include_inputs_for_metrics`: False
-- `eval_do_concat_batches`: True
-- `fp16_backend`: auto
-- `push_to_hub_model_id`: None
-- `push_to_hub_organization`: None
-- `mp_parameters`: 
-- `auto_find_batch_size`: False
-- `full_determinism`: False
-- `torchdynamo`: None
-- `ray_scope`: last
-- `ddp_timeout`: 1800
-- `torch_compile`: False
-- `torch_compile_backend`: None
-- `torch_compile_mode`: None
-- `dispatch_batches`: None
-- `split_batches`: None
-- `include_tokens_per_second`: False
-- `include_num_input_tokens_seen`: False
-- `neftune_noise_alpha`: None
-- `optim_target_modules`: None
-- `batch_eval_metrics`: False
-- `batch_sampler`: no_duplicates
-- `multi_dataset_batch_sampler`: proportional
-
-</details>
-
-### Training Logs
-| Epoch  | Step | Training Loss | Retrieval Loss | STS Loss | Reranking Loss |
-|:------:|:----:|:-------------:|:--------------:|:--------:|:---------------:|
-| 0.5222 | 500  | 0.7949        | 0.0187         | 2.6522   | 0.2931          |
-| 1.0444 | 1000 | 0.6813        | 0.0139         | 2.5109   | 0.2695          |
-| 1.5666 | 1500 | 0.5148        | 0.0118         | 2.5270   | 0.2807          |
-| 2.0888 | 2000 | 0.4800        | 0.0114         | 2.5418   | 0.2791          |
-| 2.6110 | 2500 | 0.3782        | 0.0117         | 2.5740   | 0.2787          |
-
-
-### Framework Versions
-- Python: 3.10.12
-- Sentence Transformers: 3.0.1
-- Transformers: 4.41.2
-- PyTorch: 2.2.0+cu121
-- Accelerate: 0.32.1
-- Datasets: 2.20.0
-- Tokenizers: 0.19.1
 
 ## Citation
 
```
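
For anyone who still wants the deleted configuration in usable form: the removed "Non-Default Hyperparameters" list maps onto a `SentenceTransformerTrainingArguments` object. A minimal sketch, assuming the Sentence Transformers 3.0.1 API listed under the removed "Framework Versions"; `output_dir` is a placeholder, and `eval_steps=500` is inferred from the 500-step cadence of the removed training logs rather than stated in the card:

```python
# Sketch of the training arguments removed from the model card.
# Assumptions: output_dir is a placeholder; eval_steps=500 is inferred
# from the training-log cadence. All other values are copied from the
# deleted "Non-Default Hyperparameters" list.
from sentence_transformers.training_args import (
    BatchSamplers,
    SentenceTransformerTrainingArguments,
)

args = SentenceTransformerTrainingArguments(
    output_dir="output",                        # placeholder, not in the card
    eval_strategy="steps",
    eval_steps=500,                             # inferred from the training logs
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    warmup_steps=5,
    bf16=True,
    tf32=True,
    optim="adamw_torch_fused",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # `batch_sampler`: no_duplicates
)
```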
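
The hunk's context line ("You can finetune this model on your own dataset.") shows how these arguments are meant to be used. A hedged usage sketch with the Sentence Transformers 3.x trainer; the model ID, the `all-nli` dataset, and the loss are illustrative placeholders, not taken from this PR:

```python
# Usage sketch only: plugs the `args` above into SentenceTransformerTrainer.
# The model ID, dataset, and loss below are placeholders for illustration.
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss

model = SentenceTransformer("path/to/this-model")  # placeholder model ID
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")
eval_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="dev")

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,                      # the arguments from the sketch above
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,      # needed because eval_strategy="steps"
    loss=MultipleNegativesRankingLoss(model),  # one plausible loss choice
)
trainer.train()
```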