DanteAl97 commited on
Commit
d12dced
·
verified ·
1 Parent(s): fe75127

Upload 11 files

Browse files
adapter_config.json CHANGED
@@ -20,9 +20,9 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "up_proj",
24
- "gate_proj",
25
- "down_proj"
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "down_proj",
24
  "up_proj",
25
+ "gate_proj"
 
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac26007027b8d52c4043d59b30948d8dbd10889cfce599e9d660d0ef69a0955e
3
  size 92824216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79365868e96dbd30865018cc8e0ec6ae00dd78c422c7155b448a0f25ea5785f
3
  size 92824216
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:822328453426ce5294a42e9de1e2da5c10d883af3aaadea6a8f9b0b398b68b12
3
  size 47209298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd62859726d46c7cab01c6b8c28886ac32364304c4811b5be12790acd36300ed
3
  size 47209298
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:401554a9eaccea3cd373f8884b80dcd6b5bc4aa1568304ff22f746b80036436f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46290c18c3b3920166595e51bd3f5c6fc1585185680dcfaaf67a05fc1901e325
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51e00377e40678abd327a228976d2667ba86b24b532e85a929789bde9a194307
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50875594b892047c8e46a61b82a18becc1a9e9c9af6ce75ec5a292af0d5a8cd2
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9434951252751861,
5
  "eval_steps": 100,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -457,14 +457,314 @@
457
  "eval_samples_per_second": 49.886,
458
  "eval_steps_per_second": 1.559,
459
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  }
461
  ],
462
  "logging_steps": 100,
463
- "max_steps": 3000,
464
  "num_input_tokens_seen": 0,
465
- "num_train_epochs": 1,
466
  "save_steps": 50,
467
- "total_flos": 1.2248172174916977e+18,
468
  "train_batch_size": 32,
469
  "trial_name": null,
470
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.572701541042038,
5
  "eval_steps": 100,
6
+ "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
457
  "eval_samples_per_second": 49.886,
458
  "eval_steps_per_second": 1.559,
459
  "step": 3000
460
+ },
461
+ {
462
+ "epoch": 0.97,
463
+ "grad_norm": 0.7738541960716248,
464
+ "learning_rate": 3.8775510204081634e-05,
465
+ "loss": 1.7029,
466
+ "step": 3100
467
+ },
468
+ {
469
+ "epoch": 0.97,
470
+ "eval_loss": 1.7050038576126099,
471
+ "eval_runtime": 687.1983,
472
+ "eval_samples_per_second": 49.353,
473
+ "eval_steps_per_second": 1.542,
474
+ "step": 3100
475
+ },
476
+ {
477
+ "epoch": 1.01,
478
+ "grad_norm": 0.6895627379417419,
479
+ "learning_rate": 3.673469387755102e-05,
480
+ "loss": 1.7094,
481
+ "step": 3200
482
+ },
483
+ {
484
+ "epoch": 1.01,
485
+ "eval_loss": 1.7042526006698608,
486
+ "eval_runtime": 687.1845,
487
+ "eval_samples_per_second": 49.354,
488
+ "eval_steps_per_second": 1.543,
489
+ "step": 3200
490
+ },
491
+ {
492
+ "epoch": 1.04,
493
+ "grad_norm": 0.8041057586669922,
494
+ "learning_rate": 3.469387755102041e-05,
495
+ "loss": 1.7049,
496
+ "step": 3300
497
+ },
498
+ {
499
+ "epoch": 1.04,
500
+ "eval_loss": 1.7035516500473022,
501
+ "eval_runtime": 687.4035,
502
+ "eval_samples_per_second": 49.338,
503
+ "eval_steps_per_second": 1.542,
504
+ "step": 3300
505
+ },
506
+ {
507
+ "epoch": 1.07,
508
+ "grad_norm": 0.7259939908981323,
509
+ "learning_rate": 3.265306122448979e-05,
510
+ "loss": 1.7098,
511
+ "step": 3400
512
+ },
513
+ {
514
+ "epoch": 1.07,
515
+ "eval_loss": 1.7024834156036377,
516
+ "eval_runtime": 687.1647,
517
+ "eval_samples_per_second": 49.355,
518
+ "eval_steps_per_second": 1.543,
519
+ "step": 3400
520
+ },
521
+ {
522
+ "epoch": 1.1,
523
+ "grad_norm": 0.7912746667861938,
524
+ "learning_rate": 3.061224489795919e-05,
525
+ "loss": 1.7015,
526
+ "step": 3500
527
+ },
528
+ {
529
+ "epoch": 1.1,
530
+ "eval_loss": 1.7005605697631836,
531
+ "eval_runtime": 687.1366,
532
+ "eval_samples_per_second": 49.357,
533
+ "eval_steps_per_second": 1.543,
534
+ "step": 3500
535
+ },
536
+ {
537
+ "epoch": 1.13,
538
+ "grad_norm": 0.8287527561187744,
539
+ "learning_rate": 2.857142857142857e-05,
540
+ "loss": 1.6876,
541
+ "step": 3600
542
+ },
543
+ {
544
+ "epoch": 1.13,
545
+ "eval_loss": 1.6933950185775757,
546
+ "eval_runtime": 683.5096,
547
+ "eval_samples_per_second": 49.619,
548
+ "eval_steps_per_second": 1.551,
549
+ "step": 3600
550
+ },
551
+ {
552
+ "epoch": 1.16,
553
+ "grad_norm": 0.736217737197876,
554
+ "learning_rate": 2.6530612244897963e-05,
555
+ "loss": 1.6958,
556
+ "step": 3700
557
+ },
558
+ {
559
+ "epoch": 1.16,
560
+ "eval_loss": 1.692893624305725,
561
+ "eval_runtime": 683.433,
562
+ "eval_samples_per_second": 49.624,
563
+ "eval_steps_per_second": 1.551,
564
+ "step": 3700
565
+ },
566
+ {
567
+ "epoch": 1.2,
568
+ "grad_norm": 0.7109358906745911,
569
+ "learning_rate": 2.448979591836735e-05,
570
+ "loss": 1.6885,
571
+ "step": 3800
572
+ },
573
+ {
574
+ "epoch": 1.2,
575
+ "eval_loss": 1.6916097402572632,
576
+ "eval_runtime": 683.3969,
577
+ "eval_samples_per_second": 49.627,
578
+ "eval_steps_per_second": 1.551,
579
+ "step": 3800
580
+ },
581
+ {
582
+ "epoch": 1.23,
583
+ "grad_norm": 0.7234348654747009,
584
+ "learning_rate": 2.2448979591836737e-05,
585
+ "loss": 1.6934,
586
+ "step": 3900
587
+ },
588
+ {
589
+ "epoch": 1.23,
590
+ "eval_loss": 1.6902754306793213,
591
+ "eval_runtime": 683.1628,
592
+ "eval_samples_per_second": 49.644,
593
+ "eval_steps_per_second": 1.552,
594
+ "step": 3900
595
+ },
596
+ {
597
+ "epoch": 1.26,
598
+ "grad_norm": 0.7684239149093628,
599
+ "learning_rate": 2.0408163265306123e-05,
600
+ "loss": 1.6909,
601
+ "step": 4000
602
+ },
603
+ {
604
+ "epoch": 1.26,
605
+ "eval_loss": 1.689305067062378,
606
+ "eval_runtime": 683.1661,
607
+ "eval_samples_per_second": 49.644,
608
+ "eval_steps_per_second": 1.552,
609
+ "step": 4000
610
+ },
611
+ {
612
+ "epoch": 1.29,
613
+ "grad_norm": 0.7669008374214172,
614
+ "learning_rate": 1.836734693877551e-05,
615
+ "loss": 1.6907,
616
+ "step": 4100
617
+ },
618
+ {
619
+ "epoch": 1.29,
620
+ "eval_loss": 1.688330888748169,
621
+ "eval_runtime": 683.1804,
622
+ "eval_samples_per_second": 49.643,
623
+ "eval_steps_per_second": 1.552,
624
+ "step": 4100
625
+ },
626
+ {
627
+ "epoch": 1.32,
628
+ "grad_norm": 0.7422395348548889,
629
+ "learning_rate": 1.6326530612244897e-05,
630
+ "loss": 1.6912,
631
+ "step": 4200
632
+ },
633
+ {
634
+ "epoch": 1.32,
635
+ "eval_loss": 1.687252163887024,
636
+ "eval_runtime": 683.2129,
637
+ "eval_samples_per_second": 49.64,
638
+ "eval_steps_per_second": 1.551,
639
+ "step": 4200
640
+ },
641
+ {
642
+ "epoch": 1.35,
643
+ "grad_norm": 0.7352548837661743,
644
+ "learning_rate": 1.4285714285714285e-05,
645
+ "loss": 1.6873,
646
+ "step": 4300
647
+ },
648
+ {
649
+ "epoch": 1.35,
650
+ "eval_loss": 1.6862083673477173,
651
+ "eval_runtime": 683.1788,
652
+ "eval_samples_per_second": 49.643,
653
+ "eval_steps_per_second": 1.552,
654
+ "step": 4300
655
+ },
656
+ {
657
+ "epoch": 1.38,
658
+ "grad_norm": 0.7130007147789001,
659
+ "learning_rate": 1.2244897959183674e-05,
660
+ "loss": 1.6858,
661
+ "step": 4400
662
+ },
663
+ {
664
+ "epoch": 1.38,
665
+ "eval_loss": 1.6853961944580078,
666
+ "eval_runtime": 683.1786,
667
+ "eval_samples_per_second": 49.643,
668
+ "eval_steps_per_second": 1.552,
669
+ "step": 4400
670
+ },
671
+ {
672
+ "epoch": 1.42,
673
+ "grad_norm": 0.7947734594345093,
674
+ "learning_rate": 1.0204081632653061e-05,
675
+ "loss": 1.6813,
676
+ "step": 4500
677
+ },
678
+ {
679
+ "epoch": 1.42,
680
+ "eval_loss": 1.6845451593399048,
681
+ "eval_runtime": 683.4171,
682
+ "eval_samples_per_second": 49.626,
683
+ "eval_steps_per_second": 1.551,
684
+ "step": 4500
685
+ },
686
+ {
687
+ "epoch": 1.45,
688
+ "grad_norm": 0.7227717041969299,
689
+ "learning_rate": 8.163265306122448e-06,
690
+ "loss": 1.6867,
691
+ "step": 4600
692
+ },
693
+ {
694
+ "epoch": 1.45,
695
+ "eval_loss": 1.6836014986038208,
696
+ "eval_runtime": 683.3718,
697
+ "eval_samples_per_second": 49.629,
698
+ "eval_steps_per_second": 1.551,
699
+ "step": 4600
700
+ },
701
+ {
702
+ "epoch": 1.48,
703
+ "grad_norm": 0.746582567691803,
704
+ "learning_rate": 6.122448979591837e-06,
705
+ "loss": 1.6882,
706
+ "step": 4700
707
+ },
708
+ {
709
+ "epoch": 1.48,
710
+ "eval_loss": 1.682924509048462,
711
+ "eval_runtime": 683.3662,
712
+ "eval_samples_per_second": 49.629,
713
+ "eval_steps_per_second": 1.551,
714
+ "step": 4700
715
+ },
716
+ {
717
+ "epoch": 1.51,
718
+ "grad_norm": 0.7279271483421326,
719
+ "learning_rate": 4.081632653061224e-06,
720
+ "loss": 1.6872,
721
+ "step": 4800
722
+ },
723
+ {
724
+ "epoch": 1.51,
725
+ "eval_loss": 1.682388186454773,
726
+ "eval_runtime": 683.1514,
727
+ "eval_samples_per_second": 49.645,
728
+ "eval_steps_per_second": 1.552,
729
+ "step": 4800
730
+ },
731
+ {
732
+ "epoch": 1.54,
733
+ "grad_norm": 0.7303986549377441,
734
+ "learning_rate": 2.040816326530612e-06,
735
+ "loss": 1.6898,
736
+ "step": 4900
737
+ },
738
+ {
739
+ "epoch": 1.54,
740
+ "eval_loss": 1.682073950767517,
741
+ "eval_runtime": 683.3608,
742
+ "eval_samples_per_second": 49.63,
743
+ "eval_steps_per_second": 1.551,
744
+ "step": 4900
745
+ },
746
+ {
747
+ "epoch": 1.57,
748
+ "grad_norm": 0.763130784034729,
749
+ "learning_rate": 0.0,
750
+ "loss": 1.6845,
751
+ "step": 5000
752
+ },
753
+ {
754
+ "epoch": 1.57,
755
+ "eval_loss": 1.6819010972976685,
756
+ "eval_runtime": 683.1778,
757
+ "eval_samples_per_second": 49.643,
758
+ "eval_steps_per_second": 1.552,
759
+ "step": 5000
760
  }
761
  ],
762
  "logging_steps": 100,
763
+ "max_steps": 5000,
764
  "num_input_tokens_seen": 0,
765
+ "num_train_epochs": 2,
766
  "save_steps": 50,
767
+ "total_flos": 2.03873794720034e+18,
768
  "train_batch_size": 32,
769
  "trial_name": null,
770
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91cbf40c0e12bca661a194728df5101c081c9dd12fbdbb4d794a75d0fe883d50
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc19869896809aeeb998276dffc5c796ca8ce160e5c5b0e576340e801ae8fa90
3
  size 4856