just1nseo committed on
Commit
af3f8c8
·
verified ·
1 Parent(s): 61362ef

Model save

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: alignment-handbook/zephyr-7b-sft-full
9
+ model-index:
10
+ - name: zephyr-dpop-qlora-gpt4-5e-6-epoch3
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-dpop-qlora-gpt4-5e-6-epoch3
18
+
19
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unspecified dataset (the training dataset name was not recorded by the Trainer).
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 14.5955
22
+ - Positive Losses: 143.9389
23
+ - Dpo Losses: 0.6869
24
+ - Rewards/chosen: -1.4291
25
+ - Rewards/rejected: -2.0251
26
+ - Rewards/accuracies: 0.6627
27
+ - Rewards/margins: 0.5959
28
+ - Rewards/margins Max: 2.2953
29
+ - Rewards/margins Min: -1.1073
30
+ - Rewards/margins Std: 1.5052
31
+ - Logps/rejected: -461.6887
32
+ - Logps/chosen: -428.1342
33
+ - Logits/rejected: -2.2738
34
+ - Logits/chosen: -2.3165
35
+
36
+ ## Model description
37
+
38
+ More information needed
39
+
40
+ ## Intended uses & limitations
41
+
42
+ More information needed
43
+
44
+ ## Training and evaluation data
45
+
46
+ More information needed
47
+
48
+ ## Training procedure
49
+
50
+ ### Training hyperparameters
51
+
52
+ The following hyperparameters were used during training:
53
+ - learning_rate: 5e-06
54
+ - train_batch_size: 2
55
+ - eval_batch_size: 4
56
+ - seed: 42
57
+ - distributed_type: multi-GPU
58
+ - num_devices: 8
59
+ - total_train_batch_size: 16
60
+ - total_eval_batch_size: 32
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: cosine
63
+ - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 3
65
+
66
+ ### Training results
67
+
68
+ | Training Loss | Epoch | Step | Validation Loss | Positive Losses | Dpo Losses | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Rewards/margins Max | Rewards/margins Min | Rewards/margins Std | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
69
+ |:-------------:|:-----:|:----:|:---------------:|:---------------:|:----------:|:--------------:|:----------------:|:------------------:|:---------------:|:-------------------:|:-------------------:|:-------------------:|:--------------:|:------------:|:---------------:|:-------------:|
70
+ | 0.5432 | 0.28 | 100 | 1.5490 | 8.3683 | 0.6723 | -0.0507 | -0.1015 | 0.5992 | 0.0508 | 0.2567 | -0.1414 | 0.1757 | -269.3354 | -290.2917 | -2.6677 | -2.7099 |
71
+ | 0.4843 | 0.56 | 200 | 3.6354 | 28.9322 | 0.6415 | -0.2537 | -0.4297 | 0.6349 | 0.1759 | 0.7364 | -0.3533 | 0.4858 | -302.1486 | -310.5943 | -2.5589 | -2.6000 |
72
+ | 0.2828 | 0.85 | 300 | 6.8046 | 61.7689 | 0.6346 | -0.6003 | -0.8503 | 0.6508 | 0.2500 | 1.0085 | -0.4868 | 0.6679 | -344.2117 | -345.2526 | -2.5349 | -2.5759 |
73
+ | 0.3355 | 1.13 | 400 | 11.4158 | 108.7399 | 0.6572 | -1.0761 | -1.4209 | 0.6548 | 0.3447 | 1.4626 | -0.7661 | 0.9968 | -401.2702 | -392.8341 | -2.3773 | -2.4155 |
74
+ | 0.3438 | 1.41 | 500 | 10.6413 | 101.3525 | 0.6381 | -1.0007 | -1.3406 | 0.6865 | 0.3399 | 1.3353 | -0.6338 | 0.8805 | -393.2457 | -385.2938 | -2.4471 | -2.4907 |
75
+ | 0.2144 | 1.69 | 600 | 8.5896 | 79.7998 | 0.6267 | -0.7817 | -1.2135 | 0.6865 | 0.4318 | 1.5951 | -0.6661 | 1.0047 | -380.5318 | -363.3914 | -2.3029 | -2.3438 |
76
+ | 0.3314 | 1.97 | 700 | 11.1651 | 107.2969 | 0.6525 | -1.0595 | -1.5150 | 0.6627 | 0.4555 | 1.7776 | -0.8450 | 1.1660 | -410.6869 | -391.1705 | -2.3025 | -2.3432 |
77
+ | 0.1352 | 2.25 | 800 | 13.3571 | 130.9070 | 0.6700 | -1.2986 | -1.8184 | 0.6627 | 0.5198 | 2.0225 | -0.9603 | 1.3296 | -441.0237 | -415.0786 | -2.2901 | -2.3320 |
78
+ | 0.2348 | 2.54 | 900 | 14.7241 | 145.9081 | 0.6904 | -1.4488 | -2.0053 | 0.6706 | 0.5564 | 2.1801 | -1.0958 | 1.4586 | -459.7108 | -430.1044 | -2.2661 | -2.3085 |
79
+ | 0.1369 | 2.82 | 1000 | 14.5955 | 143.9389 | 0.6869 | -1.4291 | -2.0251 | 0.6627 | 0.5959 | 2.2953 | -1.1073 | 1.5052 | -461.6887 | -428.1342 | -2.2738 | -2.3165 |
80
+
81
+
82
+ ### Framework versions
83
+
84
+ - PEFT 0.7.1
85
+ - Transformers 4.39.0.dev0
86
+ - PyTorch 2.1.2+cu121
87
+ - Datasets 2.14.6
88
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbb877455cebb3532026fabfa662987f964138b9d37c0976ba1241b56952232f
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8319769653c10d99f3730b985835d27d1eedcee4f092d2dbb20aa1d26dff630d
3
  size 671150064
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.36563416943303856,
4
+ "train_runtime": 9271.2095,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 1.837,
7
+ "train_steps_per_second": 0.115
8
+ }
runs/Jul29_11-59-46_node13/events.out.tfevents.1722222409.node13.208681.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d22aef219683c027b344767e79eaadbd0c38ff3abb24899d1a450ebf28199bf4
3
- size 116014
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84877456a01902e1a32a6ffb2fe7d6f9d9d67f416aa44cecafb80e357465a873
3
+ size 122338
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.36563416943303856,
4
+ "train_runtime": 9271.2095,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 1.837,
7
+ "train_steps_per_second": 0.115
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1065,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "dpo_losses": 0.6931471824645996,
13
+ "epoch": 0.0,
14
+ "grad_norm": 2.1182166269339366,
15
+ "learning_rate": 4.672897196261682e-08,
16
+ "logits/chosen": -2.8477635383605957,
17
+ "logits/rejected": -2.8469698429107666,
18
+ "logps/chosen": -522.6112670898438,
19
+ "logps/rejected": -359.48583984375,
20
+ "loss": 0.6931,
21
+ "positive_losses": 0.0,
22
+ "rewards/accuracies": 0.0,
23
+ "rewards/chosen": 0.0,
24
+ "rewards/margins": 0.0,
25
+ "rewards/margins_max": 0.0,
26
+ "rewards/margins_min": 0.0,
27
+ "rewards/margins_std": 0.0,
28
+ "rewards/rejected": 0.0,
29
+ "step": 1
30
+ },
31
+ {
32
+ "dpo_losses": 0.6931488513946533,
33
+ "epoch": 0.03,
34
+ "grad_norm": 18.90606759984328,
35
+ "learning_rate": 4.6728971962616824e-07,
36
+ "logits/chosen": -2.921452522277832,
37
+ "logits/rejected": -2.7972779273986816,
38
+ "logps/chosen": -313.4413146972656,
39
+ "logps/rejected": -170.33502197265625,
40
+ "loss": 0.698,
41
+ "positive_losses": 0.056756019592285156,
42
+ "rewards/accuracies": 0.5555555820465088,
43
+ "rewards/chosen": 0.00029136601369827986,
44
+ "rewards/margins": -2.1007015220675385e-06,
45
+ "rewards/margins_max": 0.0011362881632521749,
46
+ "rewards/margins_min": -0.001140489592216909,
47
+ "rewards/margins_std": 0.0016099249478429556,
48
+ "rewards/rejected": 0.0002934667863883078,
49
+ "step": 10
50
+ },
51
+ {
52
+ "dpo_losses": 0.6920409202575684,
53
+ "epoch": 0.06,
54
+ "grad_norm": 2.048573611790903,
55
+ "learning_rate": 9.345794392523365e-07,
56
+ "logits/chosen": -2.7631053924560547,
57
+ "logits/rejected": -2.7107467651367188,
58
+ "logps/chosen": -380.45953369140625,
59
+ "logps/rejected": -243.90365600585938,
60
+ "loss": 0.6928,
61
+ "positive_losses": 0.02124938927590847,
62
+ "rewards/accuracies": 0.6499999761581421,
63
+ "rewards/chosen": 0.006503595970571041,
64
+ "rewards/margins": 0.0022183754481375217,
65
+ "rewards/margins_max": 0.004751545377075672,
66
+ "rewards/margins_min": -0.00031479448080062866,
67
+ "rewards/margins_std": 0.0035824428778141737,
68
+ "rewards/rejected": 0.004285220988094807,
69
+ "step": 20
70
+ },
71
+ {
72
+ "dpo_losses": 0.6899846792221069,
73
+ "epoch": 0.08,
74
+ "grad_norm": 2.2814863944700297,
75
+ "learning_rate": 1.4018691588785047e-06,
76
+ "logits/chosen": -2.87530779838562,
77
+ "logits/rejected": -2.823071002960205,
78
+ "logps/chosen": -374.14593505859375,
79
+ "logps/rejected": -251.3402099609375,
80
+ "loss": 0.6884,
81
+ "positive_losses": 0.0,
82
+ "rewards/accuracies": 0.699999988079071,
83
+ "rewards/chosen": 0.01957734487950802,
84
+ "rewards/margins": 0.0063508180901408195,
85
+ "rewards/margins_max": 0.011502384208142757,
86
+ "rewards/margins_min": 0.0011992522049695253,
87
+ "rewards/margins_std": 0.007285414729267359,
88
+ "rewards/rejected": 0.013226528652012348,
89
+ "step": 30
90
+ },
91
+ {
92
+ "dpo_losses": 0.6816079616546631,
93
+ "epoch": 0.11,
94
+ "grad_norm": 1.9338584641637362,
95
+ "learning_rate": 1.869158878504673e-06,
96
+ "logits/chosen": -2.732996940612793,
97
+ "logits/rejected": -2.7668612003326416,
98
+ "logps/chosen": -302.9008483886719,
99
+ "logps/rejected": -315.843505859375,
100
+ "loss": 0.6807,
101
+ "positive_losses": 0.0,
102
+ "rewards/accuracies": 0.8500000238418579,
103
+ "rewards/chosen": 0.04024823382496834,
104
+ "rewards/margins": 0.023371612653136253,
105
+ "rewards/margins_max": 0.03446139022707939,
106
+ "rewards/margins_min": 0.01228183414787054,
107
+ "rewards/margins_std": 0.015683313831686974,
108
+ "rewards/rejected": 0.016876617446541786,
109
+ "step": 40
110
+ },
111
+ {
112
+ "dpo_losses": 0.6704033613204956,
113
+ "epoch": 0.14,
114
+ "grad_norm": 2.241837849751868,
115
+ "learning_rate": 2.3364485981308413e-06,
116
+ "logits/chosen": -2.7844934463500977,
117
+ "logits/rejected": -2.696681022644043,
118
+ "logps/chosen": -238.6322784423828,
119
+ "logps/rejected": -172.29949951171875,
120
+ "loss": 0.6657,
121
+ "positive_losses": 0.0,
122
+ "rewards/accuracies": 0.8999999761581421,
123
+ "rewards/chosen": 0.06590737402439117,
124
+ "rewards/margins": 0.046577345579862595,
125
+ "rewards/margins_max": 0.06979880481958389,
126
+ "rewards/margins_min": 0.023355895653367043,
127
+ "rewards/margins_std": 0.03284009173512459,
128
+ "rewards/rejected": 0.019330020993947983,
129
+ "step": 50
130
+ },
131
+ {
132
+ "dpo_losses": 0.6440416574478149,
133
+ "epoch": 0.17,
134
+ "grad_norm": 2.210137428049826,
135
+ "learning_rate": 2.8037383177570094e-06,
136
+ "logits/chosen": -2.7359554767608643,
137
+ "logits/rejected": -2.6721925735473633,
138
+ "logps/chosen": -255.8380584716797,
139
+ "logps/rejected": -234.24276733398438,
140
+ "loss": 0.6442,
141
+ "positive_losses": 0.0,
142
+ "rewards/accuracies": 0.949999988079071,
143
+ "rewards/chosen": 0.11597372591495514,
144
+ "rewards/margins": 0.10343287885189056,
145
+ "rewards/margins_max": 0.15465359389781952,
146
+ "rewards/margins_min": 0.0522121861577034,
147
+ "rewards/margins_std": 0.07243702560663223,
148
+ "rewards/rejected": 0.012540824711322784,
149
+ "step": 60
150
+ },
151
+ {
152
+ "dpo_losses": 0.6189829111099243,
153
+ "epoch": 0.2,
154
+ "grad_norm": 1.721316554071705,
155
+ "learning_rate": 3.2710280373831774e-06,
156
+ "logits/chosen": -2.6506264209747314,
157
+ "logits/rejected": -2.65425443649292,
158
+ "logps/chosen": -316.86090087890625,
159
+ "logps/rejected": -213.46066284179688,
160
+ "loss": 0.6213,
161
+ "positive_losses": 0.0,
162
+ "rewards/accuracies": 1.0,
163
+ "rewards/chosen": 0.17079836130142212,
164
+ "rewards/margins": 0.15738530457019806,
165
+ "rewards/margins_max": 0.23612920939922333,
166
+ "rewards/margins_min": 0.078641377389431,
167
+ "rewards/margins_std": 0.11136071383953094,
168
+ "rewards/rejected": 0.013413062319159508,
169
+ "step": 70
170
+ },
171
+ {
172
+ "dpo_losses": 0.59584641456604,
173
+ "epoch": 0.23,
174
+ "grad_norm": 5.833494928937326,
175
+ "learning_rate": 3.738317757009346e-06,
176
+ "logits/chosen": -2.8657782077789307,
177
+ "logits/rejected": -2.7893776893615723,
178
+ "logps/chosen": -318.1582946777344,
179
+ "logps/rejected": -289.5111999511719,
180
+ "loss": 0.6052,
181
+ "positive_losses": 0.0,
182
+ "rewards/accuracies": 1.0,
183
+ "rewards/chosen": 0.18891094624996185,
184
+ "rewards/margins": 0.2127668410539627,
185
+ "rewards/margins_max": 0.30381911993026733,
186
+ "rewards/margins_min": 0.12171456962823868,
187
+ "rewards/margins_std": 0.12876734137535095,
188
+ "rewards/rejected": -0.023855898529291153,
189
+ "step": 80
190
+ },
191
+ {
192
+ "dpo_losses": 0.5957463383674622,
193
+ "epoch": 0.25,
194
+ "grad_norm": 2.2476761791524478,
195
+ "learning_rate": 4.205607476635514e-06,
196
+ "logits/chosen": -2.668989658355713,
197
+ "logits/rejected": -2.6688218116760254,
198
+ "logps/chosen": -256.0438537597656,
199
+ "logps/rejected": -203.91128540039062,
200
+ "loss": 0.5735,
201
+ "positive_losses": 0.0,
202
+ "rewards/accuracies": 0.949999988079071,
203
+ "rewards/chosen": 0.20931395888328552,
204
+ "rewards/margins": 0.21628114581108093,
205
+ "rewards/margins_max": 0.35886088013648987,
206
+ "rewards/margins_min": 0.07370143383741379,
207
+ "rewards/margins_std": 0.20163817703723907,
208
+ "rewards/rejected": -0.00696719903498888,
209
+ "step": 90
210
+ },
211
+ {
212
+ "dpo_losses": 0.5217684507369995,
213
+ "epoch": 0.28,
214
+ "grad_norm": 1.9962825797807737,
215
+ "learning_rate": 4.6728971962616825e-06,
216
+ "logits/chosen": -2.8356316089630127,
217
+ "logits/rejected": -2.7704596519470215,
218
+ "logps/chosen": -424.61712646484375,
219
+ "logps/rejected": -347.96246337890625,
220
+ "loss": 0.5432,
221
+ "positive_losses": 0.040345001965761185,
222
+ "rewards/accuracies": 1.0,
223
+ "rewards/chosen": 0.336413711309433,
224
+ "rewards/margins": 0.3945561945438385,
225
+ "rewards/margins_max": 0.5599964261054993,
226
+ "rewards/margins_min": 0.22911591827869415,
227
+ "rewards/margins_std": 0.2339678257703781,
228
+ "rewards/rejected": -0.058142442256212234,
229
+ "step": 100
230
+ },
231
+ {
232
+ "epoch": 0.28,
233
+ "eval_dpo_losses": 0.6723248958587646,
234
+ "eval_logits/chosen": -2.7099392414093018,
235
+ "eval_logits/rejected": -2.6677489280700684,
236
+ "eval_logps/chosen": -290.291748046875,
237
+ "eval_logps/rejected": -269.3353576660156,
238
+ "eval_loss": 1.5490143299102783,
239
+ "eval_positive_losses": 8.368339538574219,
240
+ "eval_rewards/accuracies": 0.5992063283920288,
241
+ "eval_rewards/chosen": -0.05070570856332779,
242
+ "eval_rewards/margins": 0.05082136392593384,
243
+ "eval_rewards/margins_max": 0.25667881965637207,
244
+ "eval_rewards/margins_min": -0.14138472080230713,
245
+ "eval_rewards/margins_std": 0.17566871643066406,
246
+ "eval_rewards/rejected": -0.10152707248926163,
247
+ "eval_runtime": 282.2019,
248
+ "eval_samples_per_second": 7.087,
249
+ "eval_steps_per_second": 0.223,
250
+ "step": 100
251
+ },
252
+ {
253
+ "dpo_losses": 0.5300568342208862,
254
+ "epoch": 0.31,
255
+ "grad_norm": 2.4142164656542064,
256
+ "learning_rate": 4.999879018839288e-06,
257
+ "logits/chosen": -2.694153070449829,
258
+ "logits/rejected": -2.5792508125305176,
259
+ "logps/chosen": -301.54388427734375,
260
+ "logps/rejected": -258.5564270019531,
261
+ "loss": 0.5372,
262
+ "positive_losses": 0.238017275929451,
263
+ "rewards/accuracies": 1.0,
264
+ "rewards/chosen": 0.26155275106430054,
265
+ "rewards/margins": 0.3829374313354492,
266
+ "rewards/margins_max": 0.5613982677459717,
267
+ "rewards/margins_min": 0.20447655022144318,
268
+ "rewards/margins_std": 0.2523817718029022,
269
+ "rewards/rejected": -0.12138471752405167,
270
+ "step": 110
271
+ },
272
+ {
273
+ "dpo_losses": 0.488577663898468,
274
+ "epoch": 0.34,
275
+ "grad_norm": 9.3268800860026,
276
+ "learning_rate": 4.99772856836941e-06,
277
+ "logits/chosen": -2.8129353523254395,
278
+ "logits/rejected": -2.79107403755188,
279
+ "logps/chosen": -337.3607177734375,
280
+ "logps/rejected": -347.9256286621094,
281
+ "loss": 0.5066,
282
+ "positive_losses": 0.0004665374872274697,
283
+ "rewards/accuracies": 1.0,
284
+ "rewards/chosen": 0.345800518989563,
285
+ "rewards/margins": 0.5089500546455383,
286
+ "rewards/margins_max": 0.725378692150116,
287
+ "rewards/margins_min": 0.29252126812934875,
288
+ "rewards/margins_std": 0.3060764968395233,
289
+ "rewards/rejected": -0.16314946115016937,
290
+ "step": 120
291
+ },
292
+ {
293
+ "dpo_losses": 0.42630523443222046,
294
+ "epoch": 0.37,
295
+ "grad_norm": 6.838233431406268,
296
+ "learning_rate": 4.992892309373227e-06,
297
+ "logits/chosen": -2.64375376701355,
298
+ "logits/rejected": -2.5765233039855957,
299
+ "logps/chosen": -352.48126220703125,
300
+ "logps/rejected": -287.73388671875,
301
+ "loss": 0.4592,
302
+ "positive_losses": 0.0,
303
+ "rewards/accuracies": 1.0,
304
+ "rewards/chosen": 0.3849102854728699,
305
+ "rewards/margins": 0.679017186164856,
306
+ "rewards/margins_max": 0.8494507670402527,
307
+ "rewards/margins_min": 0.5085835456848145,
308
+ "rewards/margins_std": 0.24102959036827087,
309
+ "rewards/rejected": -0.2941069006919861,
310
+ "step": 130
311
+ },
312
+ {
313
+ "dpo_losses": 0.4316105842590332,
314
+ "epoch": 0.39,
315
+ "grad_norm": 24.291983530934388,
316
+ "learning_rate": 4.985375442281969e-06,
317
+ "logits/chosen": -2.6181836128234863,
318
+ "logits/rejected": -2.5858638286590576,
319
+ "logps/chosen": -320.45770263671875,
320
+ "logps/rejected": -268.80950927734375,
321
+ "loss": 0.4791,
322
+ "positive_losses": 0.11783752590417862,
323
+ "rewards/accuracies": 0.949999988079071,
324
+ "rewards/chosen": 0.3075354993343353,
325
+ "rewards/margins": 0.6785004138946533,
326
+ "rewards/margins_max": 0.9275323748588562,
327
+ "rewards/margins_min": 0.42946839332580566,
328
+ "rewards/margins_std": 0.35218438506126404,
329
+ "rewards/rejected": -0.37096482515335083,
330
+ "step": 140
331
+ },
332
+ {
333
+ "dpo_losses": 0.43038830161094666,
334
+ "epoch": 0.42,
335
+ "grad_norm": 16.31539388542976,
336
+ "learning_rate": 4.9751860499858175e-06,
337
+ "logits/chosen": -2.6524252891540527,
338
+ "logits/rejected": -2.582724094390869,
339
+ "logps/chosen": -269.40289306640625,
340
+ "logps/rejected": -286.25921630859375,
341
+ "loss": 0.4405,
342
+ "positive_losses": 0.45142823457717896,
343
+ "rewards/accuracies": 1.0,
344
+ "rewards/chosen": 0.27419334650039673,
345
+ "rewards/margins": 0.6590893268585205,
346
+ "rewards/margins_max": 0.8764106631278992,
347
+ "rewards/margins_min": 0.44176802039146423,
348
+ "rewards/margins_std": 0.307338684797287,
349
+ "rewards/rejected": -0.3848959803581238,
350
+ "step": 150
351
+ },
352
+ {
353
+ "dpo_losses": 0.40865588188171387,
354
+ "epoch": 0.45,
355
+ "grad_norm": 9.196914105554178,
356
+ "learning_rate": 4.962335089142376e-06,
357
+ "logits/chosen": -2.671811580657959,
358
+ "logits/rejected": -2.557887554168701,
359
+ "logps/chosen": -298.1056823730469,
360
+ "logps/rejected": -286.0025329589844,
361
+ "loss": 0.4273,
362
+ "positive_losses": 1.3531090021133423,
363
+ "rewards/accuracies": 1.0,
364
+ "rewards/chosen": 0.3361397683620453,
365
+ "rewards/margins": 0.7648395299911499,
366
+ "rewards/margins_max": 1.1010096073150635,
367
+ "rewards/margins_min": 0.42866945266723633,
368
+ "rewards/margins_std": 0.47541624307632446,
369
+ "rewards/rejected": -0.4286997318267822,
370
+ "step": 160
371
+ },
372
+ {
373
+ "dpo_losses": 0.36792081594467163,
374
+ "epoch": 0.48,
375
+ "grad_norm": 3.4842547066626572,
376
+ "learning_rate": 4.946836378394967e-06,
377
+ "logits/chosen": -2.719280242919922,
378
+ "logits/rejected": -2.560034990310669,
379
+ "logps/chosen": -358.5220642089844,
380
+ "logps/rejected": -287.3617858886719,
381
+ "loss": 0.4612,
382
+ "positive_losses": 0.0,
383
+ "rewards/accuracies": 1.0,
384
+ "rewards/chosen": 0.4195915162563324,
385
+ "rewards/margins": 0.9577536582946777,
386
+ "rewards/margins_max": 1.3000526428222656,
387
+ "rewards/margins_min": 0.6154545545578003,
388
+ "rewards/margins_std": 0.48408395051956177,
389
+ "rewards/rejected": -0.5381620526313782,
390
+ "step": 170
391
+ },
392
+ {
393
+ "dpo_losses": 0.3783532977104187,
394
+ "epoch": 0.51,
395
+ "grad_norm": 2.2304162655616295,
396
+ "learning_rate": 4.928706583513441e-06,
397
+ "logits/chosen": -2.599997043609619,
398
+ "logits/rejected": -2.5286357402801514,
399
+ "logps/chosen": -297.5575866699219,
400
+ "logps/rejected": -439.0782165527344,
401
+ "loss": 0.4178,
402
+ "positive_losses": 0.7186470031738281,
403
+ "rewards/accuracies": 1.0,
404
+ "rewards/chosen": 0.26578420400619507,
405
+ "rewards/margins": 0.8594935536384583,
406
+ "rewards/margins_max": 1.1510531902313232,
407
+ "rewards/margins_min": 0.5679339170455933,
408
+ "rewards/margins_std": 0.4123275876045227,
409
+ "rewards/rejected": -0.5937093496322632,
410
+ "step": 180
411
+ },
412
+ {
413
+ "dpo_losses": 0.3120216727256775,
414
+ "epoch": 0.54,
415
+ "grad_norm": 2.502842717440193,
416
+ "learning_rate": 4.907965199473471e-06,
417
+ "logits/chosen": -2.5457229614257812,
418
+ "logits/rejected": -2.3784427642822266,
419
+ "logps/chosen": -403.69097900390625,
420
+ "logps/rejected": -277.13079833984375,
421
+ "loss": 0.4291,
422
+ "positive_losses": 0.1399887055158615,
423
+ "rewards/accuracies": 1.0,
424
+ "rewards/chosen": 0.524212658405304,
425
+ "rewards/margins": 1.0702520608901978,
426
+ "rewards/margins_max": 1.2478736639022827,
427
+ "rewards/margins_min": 0.892630398273468,
428
+ "rewards/margins_std": 0.25119495391845703,
429
+ "rewards/rejected": -0.546039342880249,
430
+ "step": 190
431
+ },
432
+ {
433
+ "dpo_losses": 0.340221643447876,
434
+ "epoch": 0.56,
435
+ "grad_norm": 39.20326200653621,
436
+ "learning_rate": 4.884634529493591e-06,
437
+ "logits/chosen": -2.686732769012451,
438
+ "logits/rejected": -2.5898447036743164,
439
+ "logps/chosen": -290.22259521484375,
440
+ "logps/rejected": -267.71923828125,
441
+ "loss": 0.4843,
442
+ "positive_losses": 1.2471590042114258,
443
+ "rewards/accuracies": 0.949999988079071,
444
+ "rewards/chosen": 0.405730664730072,
445
+ "rewards/margins": 1.0374891757965088,
446
+ "rewards/margins_max": 1.3196719884872437,
447
+ "rewards/margins_min": 0.7553063035011292,
448
+ "rewards/margins_std": 0.3990669250488281,
449
+ "rewards/rejected": -0.6317585110664368,
450
+ "step": 200
451
+ },
452
+ {
453
+ "epoch": 0.56,
454
+ "eval_dpo_losses": 0.6414885520935059,
455
+ "eval_logits/chosen": -2.6000208854675293,
456
+ "eval_logits/rejected": -2.5589327812194824,
457
+ "eval_logps/chosen": -310.5942687988281,
458
+ "eval_logps/rejected": -302.1485595703125,
459
+ "eval_loss": 3.6353535652160645,
460
+ "eval_positive_losses": 28.93216323852539,
461
+ "eval_rewards/accuracies": 0.6349206566810608,
462
+ "eval_rewards/chosen": -0.2537309527397156,
463
+ "eval_rewards/margins": 0.17592783272266388,
464
+ "eval_rewards/margins_max": 0.7364258766174316,
465
+ "eval_rewards/margins_min": -0.35331711173057556,
466
+ "eval_rewards/margins_std": 0.48577553033828735,
467
+ "eval_rewards/rejected": -0.42965877056121826,
468
+ "eval_runtime": 280.7482,
469
+ "eval_samples_per_second": 7.124,
470
+ "eval_steps_per_second": 0.224,
471
+ "step": 200
472
+ },
473
+ {
474
+ "dpo_losses": 0.4035143256187439,
475
+ "epoch": 0.59,
476
+ "grad_norm": 2.0960462134704727,
477
+ "learning_rate": 4.858739661052539e-06,
478
+ "logits/chosen": -2.449962854385376,
479
+ "logits/rejected": -2.364846706390381,
480
+ "logps/chosen": -316.26300048828125,
481
+ "logps/rejected": -311.43115234375,
482
+ "loss": 0.3993,
483
+ "positive_losses": 0.28444308042526245,
484
+ "rewards/accuracies": 0.949999988079071,
485
+ "rewards/chosen": 0.4057347774505615,
486
+ "rewards/margins": 0.8319534063339233,
487
+ "rewards/margins_max": 1.267531156539917,
488
+ "rewards/margins_min": 0.3963755965232849,
489
+ "rewards/margins_std": 0.6160000562667847,
490
+ "rewards/rejected": -0.4262186586856842,
491
+ "step": 210
492
+ },
493
+ {
494
+ "dpo_losses": 0.2952147424221039,
495
+ "epoch": 0.62,
496
+ "grad_norm": 2.161286043094922,
497
+ "learning_rate": 4.830308438912687e-06,
498
+ "logits/chosen": -2.7107901573181152,
499
+ "logits/rejected": -2.560708522796631,
500
+ "logps/chosen": -364.57659912109375,
501
+ "logps/rejected": -361.52545166015625,
502
+ "loss": 0.3728,
503
+ "positive_losses": 0.0,
504
+ "rewards/accuracies": 1.0,
505
+ "rewards/chosen": 0.4469282031059265,
506
+ "rewards/margins": 1.2792309522628784,
507
+ "rewards/margins_max": 1.8157857656478882,
508
+ "rewards/margins_min": 0.7426761388778687,
509
+ "rewards/margins_std": 0.7588031888008118,
510
+ "rewards/rejected": -0.8323026895523071,
511
+ "step": 220
512
+ },
513
+ {
514
+ "dpo_losses": 0.26847127079963684,
515
+ "epoch": 0.65,
516
+ "grad_norm": 14.213158348490241,
517
+ "learning_rate": 4.799371435178544e-06,
518
+ "logits/chosen": -2.694364070892334,
519
+ "logits/rejected": -2.611956834793091,
520
+ "logps/chosen": -353.65203857421875,
521
+ "logps/rejected": -447.9458923339844,
522
+ "loss": 0.4538,
523
+ "positive_losses": 0.512774646282196,
524
+ "rewards/accuracies": 1.0,
525
+ "rewards/chosen": 0.2710634768009186,
526
+ "rewards/margins": 1.3846604824066162,
527
+ "rewards/margins_max": 1.9593979120254517,
528
+ "rewards/margins_min": 0.809922993183136,
529
+ "rewards/margins_std": 0.8128012418746948,
530
+ "rewards/rejected": -1.113596796989441,
531
+ "step": 230
532
+ },
533
+ {
534
+ "dpo_losses": 0.3169272541999817,
535
+ "epoch": 0.68,
536
+ "grad_norm": 19.218706332521528,
537
+ "learning_rate": 4.765961916422575e-06,
538
+ "logits/chosen": -2.664609670639038,
539
+ "logits/rejected": -2.5199811458587646,
540
+ "logps/chosen": -309.8558349609375,
541
+ "logps/rejected": -378.1949157714844,
542
+ "loss": 0.4789,
543
+ "positive_losses": 2.6823112964630127,
544
+ "rewards/accuracies": 0.949999988079071,
545
+ "rewards/chosen": 0.3129195272922516,
546
+ "rewards/margins": 1.156752586364746,
547
+ "rewards/margins_max": 1.4397625923156738,
548
+ "rewards/margins_min": 0.8737425804138184,
549
+ "rewards/margins_std": 0.4002366065979004,
550
+ "rewards/rejected": -0.8438330888748169,
551
+ "step": 240
552
+ },
553
+ {
554
+ "dpo_losses": 0.2593916356563568,
555
+ "epoch": 0.7,
556
+ "grad_norm": 10.943604253629541,
557
+ "learning_rate": 4.730115807913627e-06,
558
+ "logits/chosen": -2.6513824462890625,
559
+ "logits/rejected": -2.482832431793213,
560
+ "logps/chosen": -369.9505920410156,
561
+ "logps/rejected": -348.71759033203125,
562
+ "loss": 0.3659,
563
+ "positive_losses": 0.7899643182754517,
564
+ "rewards/accuracies": 1.0,
565
+ "rewards/chosen": 0.45276421308517456,
566
+ "rewards/margins": 1.3846436738967896,
567
+ "rewards/margins_max": 1.6470266580581665,
568
+ "rewards/margins_min": 1.1222608089447021,
569
+ "rewards/margins_std": 0.3710656762123108,
570
+ "rewards/rejected": -0.9318795204162598,
571
+ "step": 250
572
+ },
573
+ {
574
+ "dpo_losses": 0.3340635895729065,
575
+ "epoch": 0.73,
576
+ "grad_norm": 12.721859863392186,
577
+ "learning_rate": 4.691871654986485e-06,
578
+ "logits/chosen": -2.592149019241333,
579
+ "logits/rejected": -2.553668975830078,
580
+ "logps/chosen": -290.64971923828125,
581
+ "logps/rejected": -312.02752685546875,
582
+ "loss": 0.5868,
583
+ "positive_losses": 1.0232231616973877,
584
+ "rewards/accuracies": 1.0,
585
+ "rewards/chosen": 0.3117257356643677,
586
+ "rewards/margins": 1.1895887851715088,
587
+ "rewards/margins_max": 1.7240755558013916,
588
+ "rewards/margins_min": 0.6551022529602051,
589
+ "rewards/margins_std": 0.755878210067749,
590
+ "rewards/rejected": -0.8778631091117859,
591
+ "step": 260
592
+ },
593
+ {
594
+ "dpo_losses": 0.2871754467487335,
595
+ "epoch": 0.76,
596
+ "grad_norm": 20.74321957766599,
597
+ "learning_rate": 4.651270581594054e-06,
598
+ "logits/chosen": -2.6253020763397217,
599
+ "logits/rejected": -2.486907482147217,
600
+ "logps/chosen": -381.5191955566406,
601
+ "logps/rejected": -312.0382995605469,
602
+ "loss": 0.4914,
603
+ "positive_losses": 0.877484142780304,
604
+ "rewards/accuracies": 1.0,
605
+ "rewards/chosen": 0.4747743606567383,
606
+ "rewards/margins": 1.2697714567184448,
607
+ "rewards/margins_max": 1.6828867197036743,
608
+ "rewards/margins_min": 0.8566561937332153,
609
+ "rewards/margins_std": 0.5842332243919373,
610
+ "rewards/rejected": -0.7949970960617065,
611
+ "step": 270
612
+ },
613
+ {
614
+ "dpo_losses": 0.325231671333313,
615
+ "epoch": 0.79,
616
+ "grad_norm": 24.63939637076414,
617
+ "learning_rate": 4.6083562460867545e-06,
618
+ "logits/chosen": -2.4967987537384033,
619
+ "logits/rejected": -2.4599080085754395,
620
+ "logps/chosen": -295.0630798339844,
621
+ "logps/rejected": -352.3880615234375,
622
+ "loss": 0.4194,
623
+ "positive_losses": 1.5066649913787842,
624
+ "rewards/accuracies": 0.949999988079071,
625
+ "rewards/chosen": 0.34013646841049194,
626
+ "rewards/margins": 1.2071036100387573,
627
+ "rewards/margins_max": 1.8113868236541748,
628
+ "rewards/margins_min": 0.6028203964233398,
629
+ "rewards/margins_std": 0.8545855283737183,
630
+ "rewards/rejected": -0.8669670820236206,
631
+ "step": 280
632
+ },
633
+ {
634
+ "dpo_losses": 0.29185962677001953,
635
+ "epoch": 0.82,
636
+ "grad_norm": 49.4876607292477,
637
+ "learning_rate": 4.563174794266684e-06,
638
+ "logits/chosen": -2.725825071334839,
639
+ "logits/rejected": -2.5793709754943848,
640
+ "logps/chosen": -316.76153564453125,
641
+ "logps/rejected": -357.36334228515625,
642
+ "loss": 0.4944,
643
+ "positive_losses": 1.7035411596298218,
644
+ "rewards/accuracies": 1.0,
645
+ "rewards/chosen": 0.45084109902381897,
646
+ "rewards/margins": 1.3104431629180908,
647
+ "rewards/margins_max": 1.7685630321502686,
648
+ "rewards/margins_min": 0.8523231744766235,
649
+ "rewards/margins_std": 0.6478795409202576,
650
+ "rewards/rejected": -0.859602153301239,
651
+ "step": 290
652
+ },
653
+ {
654
+ "dpo_losses": 0.24671301245689392,
655
+ "epoch": 0.85,
656
+ "grad_norm": 38.65216195794793,
657
+ "learning_rate": 4.5157748097670125e-06,
658
+ "logits/chosen": -2.6136484146118164,
659
+ "logits/rejected": -2.5544955730438232,
660
+ "logps/chosen": -294.6103820800781,
661
+ "logps/rejected": -422.00531005859375,
662
+ "loss": 0.2828,
663
+ "positive_losses": 0.0,
664
+ "rewards/accuracies": 1.0,
665
+ "rewards/chosen": 0.43132978677749634,
666
+ "rewards/margins": 1.5242297649383545,
667
+ "rewards/margins_max": 1.9953060150146484,
668
+ "rewards/margins_min": 1.0531535148620605,
669
+ "rewards/margins_std": 0.6662023067474365,
670
+ "rewards/rejected": -1.092900037765503,
671
+ "step": 300
672
+ },
673
+ {
674
+ "epoch": 0.85,
675
+ "eval_dpo_losses": 0.6346198320388794,
676
+ "eval_logits/chosen": -2.5758955478668213,
677
+ "eval_logits/rejected": -2.534869909286499,
678
+ "eval_logps/chosen": -345.25262451171875,
679
+ "eval_logps/rejected": -344.2117004394531,
680
+ "eval_loss": 6.804558753967285,
681
+ "eval_positive_losses": 61.76890182495117,
682
+ "eval_rewards/accuracies": 0.6507936716079712,
683
+ "eval_rewards/chosen": -0.6003143787384033,
684
+ "eval_rewards/margins": 0.24997644126415253,
685
+ "eval_rewards/margins_max": 1.0084712505340576,
686
+ "eval_rewards/margins_min": -0.48679542541503906,
687
+ "eval_rewards/margins_std": 0.6679101586341858,
688
+ "eval_rewards/rejected": -0.8502907156944275,
689
+ "eval_runtime": 281.0888,
690
+ "eval_samples_per_second": 7.115,
691
+ "eval_steps_per_second": 0.224,
692
+ "step": 300
693
+ },
694
+ {
695
+ "dpo_losses": 0.2701479196548462,
696
+ "epoch": 0.87,
697
+ "grad_norm": 2.3545752201762977,
698
+ "learning_rate": 4.466207261809989e-06,
699
+ "logits/chosen": -2.7792999744415283,
700
+ "logits/rejected": -2.558169364929199,
701
+ "logps/chosen": -376.7308044433594,
702
+ "logps/rejected": -358.2815246582031,
703
+ "loss": 0.4216,
704
+ "positive_losses": 0.41081467270851135,
705
+ "rewards/accuracies": 1.0,
706
+ "rewards/chosen": 0.47516530752182007,
707
+ "rewards/margins": 1.4481419324874878,
708
+ "rewards/margins_max": 1.9792436361312866,
709
+ "rewards/margins_min": 0.9170401692390442,
710
+ "rewards/margins_std": 0.7510912418365479,
711
+ "rewards/rejected": -0.972976565361023,
712
+ "step": 310
713
+ },
714
+ {
715
+ "dpo_losses": 0.2924317717552185,
716
+ "epoch": 0.9,
717
+ "grad_norm": 12.007334692835927,
718
+ "learning_rate": 4.414525450399713e-06,
719
+ "logits/chosen": -2.6015143394470215,
720
+ "logits/rejected": -2.5146775245666504,
721
+ "logps/chosen": -315.1227111816406,
722
+ "logps/rejected": -350.9872131347656,
723
+ "loss": 0.3361,
724
+ "positive_losses": 2.5894155502319336,
725
+ "rewards/accuracies": 0.949999988079071,
726
+ "rewards/chosen": 0.3113010823726654,
727
+ "rewards/margins": 1.4402223825454712,
728
+ "rewards/margins_max": 2.083007335662842,
729
+ "rewards/margins_min": 0.7974374890327454,
730
+ "rewards/margins_std": 0.9090349078178406,
731
+ "rewards/rejected": -1.128921389579773,
732
+ "step": 320
733
+ },
734
+ {
735
+ "dpo_losses": 0.2676263153553009,
736
+ "epoch": 0.93,
737
+ "grad_norm": 29.603619163509805,
738
+ "learning_rate": 4.360784949008615e-06,
739
+ "logits/chosen": -2.705920934677124,
740
+ "logits/rejected": -2.5720715522766113,
741
+ "logps/chosen": -338.70623779296875,
742
+ "logps/rejected": -376.7459716796875,
743
+ "loss": 0.4384,
744
+ "positive_losses": 2.4138710498809814,
745
+ "rewards/accuracies": 1.0,
746
+ "rewards/chosen": 0.40980419516563416,
747
+ "rewards/margins": 1.7029374837875366,
748
+ "rewards/margins_max": 2.3179752826690674,
749
+ "rewards/margins_min": 1.087899923324585,
750
+ "rewards/margins_std": 0.8697945475578308,
751
+ "rewards/rejected": -1.2931334972381592,
752
+ "step": 330
753
+ },
754
+ {
755
+ "dpo_losses": 0.3142815828323364,
756
+ "epoch": 0.96,
757
+ "grad_norm": 18.502276306325797,
758
+ "learning_rate": 4.30504354481929e-06,
759
+ "logits/chosen": -2.5686209201812744,
760
+ "logits/rejected": -2.424360990524292,
761
+ "logps/chosen": -333.12530517578125,
762
+ "logps/rejected": -292.06707763671875,
763
+ "loss": 0.3621,
764
+ "positive_losses": 0.5738517642021179,
765
+ "rewards/accuracies": 1.0,
766
+ "rewards/chosen": 0.3537023365497589,
767
+ "rewards/margins": 1.2313454151153564,
768
+ "rewards/margins_max": 1.7322998046875,
769
+ "rewards/margins_min": 0.7303910255432129,
770
+ "rewards/margins_std": 0.7084565758705139,
771
+ "rewards/rejected": -0.8776431083679199,
772
+ "step": 340
773
+ },
774
+ {
775
+ "dpo_losses": 0.18595710396766663,
776
+ "epoch": 0.99,
777
+ "grad_norm": 137.9970915114699,
778
+ "learning_rate": 4.247361176585904e-06,
779
+ "logits/chosen": -2.5403497219085693,
780
+ "logits/rejected": -2.4443917274475098,
781
+ "logps/chosen": -386.3079833984375,
782
+ "logps/rejected": -441.9620056152344,
783
+ "loss": 0.4456,
784
+ "positive_losses": 0.9461520910263062,
785
+ "rewards/accuracies": 1.0,
786
+ "rewards/chosen": 0.4209546446800232,
787
+ "rewards/margins": 1.8566910028457642,
788
+ "rewards/margins_max": 2.370370388031006,
789
+ "rewards/margins_min": 1.3430118560791016,
790
+ "rewards/margins_std": 0.7264522910118103,
791
+ "rewards/rejected": -1.4357364177703857,
792
+ "step": 350
793
+ },
794
+ {
795
+ "dpo_losses": 0.24505066871643066,
796
+ "epoch": 1.01,
797
+ "grad_norm": 3.181861527878417,
798
+ "learning_rate": 4.187799870182038e-06,
799
+ "logits/chosen": -2.5239622592926025,
800
+ "logits/rejected": -2.4095091819763184,
801
+ "logps/chosen": -305.76092529296875,
802
+ "logps/rejected": -311.29388427734375,
803
+ "loss": 0.3141,
804
+ "positive_losses": 0.24010220170021057,
805
+ "rewards/accuracies": 1.0,
806
+ "rewards/chosen": 0.4044499397277832,
807
+ "rewards/margins": 1.4896382093429565,
808
+ "rewards/margins_max": 2.010220527648926,
809
+ "rewards/margins_min": 0.9690560102462769,
810
+ "rewards/margins_std": 0.7362144589424133,
811
+ "rewards/rejected": -1.0851882696151733,
812
+ "step": 360
813
+ },
814
+ {
815
+ "dpo_losses": 0.24201233685016632,
816
+ "epoch": 1.04,
817
+ "grad_norm": 54.166595052035994,
818
+ "learning_rate": 4.1264236719042365e-06,
819
+ "logits/chosen": -2.362025737762451,
820
+ "logits/rejected": -2.3736584186553955,
821
+ "logps/chosen": -307.57098388671875,
822
+ "logps/rejected": -404.64447021484375,
823
+ "loss": 0.3927,
824
+ "positive_losses": 0.0,
825
+ "rewards/accuracies": 1.0,
826
+ "rewards/chosen": 0.44086480140686035,
827
+ "rewards/margins": 1.8494741916656494,
828
+ "rewards/margins_max": 2.651545763015747,
829
+ "rewards/margins_min": 1.0474025011062622,
830
+ "rewards/margins_std": 1.1343004703521729,
831
+ "rewards/rejected": -1.40860915184021,
832
+ "step": 370
833
+ },
834
+ {
835
+ "dpo_losses": 0.22759906947612762,
836
+ "epoch": 1.07,
837
+ "grad_norm": 38.27158679057807,
838
+ "learning_rate": 4.063298579603001e-06,
839
+ "logits/chosen": -2.472991466522217,
840
+ "logits/rejected": -2.2599282264709473,
841
+ "logps/chosen": -351.54620361328125,
842
+ "logps/rejected": -319.0135803222656,
843
+ "loss": 0.3817,
844
+ "positive_losses": 3.08628511428833,
845
+ "rewards/accuracies": 1.0,
846
+ "rewards/chosen": 0.3943432867527008,
847
+ "rewards/margins": 1.7396957874298096,
848
+ "rewards/margins_max": 2.3970978260040283,
849
+ "rewards/margins_min": 1.0822933912277222,
850
+ "rewards/margins_std": 0.9297070503234863,
851
+ "rewards/rejected": -1.3453524112701416,
852
+ "step": 380
853
+ },
854
+ {
855
+ "dpo_losses": 0.134817972779274,
856
+ "epoch": 1.1,
857
+ "grad_norm": 2.791305623027733,
858
+ "learning_rate": 3.998492471715272e-06,
859
+ "logits/chosen": -2.496638059616089,
860
+ "logits/rejected": -2.455773115158081,
861
+ "logps/chosen": -349.44073486328125,
862
+ "logps/rejected": -536.924560546875,
863
+ "loss": 0.3072,
864
+ "positive_losses": 0.0,
865
+ "rewards/accuracies": 1.0,
866
+ "rewards/chosen": 0.4807213246822357,
867
+ "rewards/margins": 2.533548355102539,
868
+ "rewards/margins_max": 3.3716654777526855,
869
+ "rewards/margins_min": 1.695431113243103,
870
+ "rewards/margins_std": 1.185276746749878,
871
+ "rewards/rejected": -2.0528271198272705,
872
+ "step": 390
873
+ },
874
+ {
875
+ "dpo_losses": 0.23253802955150604,
876
+ "epoch": 1.13,
877
+ "grad_norm": 1.567698262162245,
878
+ "learning_rate": 3.932075034274723e-06,
879
+ "logits/chosen": -2.385937213897705,
880
+ "logits/rejected": -2.3955626487731934,
881
+ "logps/chosen": -266.9366455078125,
882
+ "logps/rejected": -367.39764404296875,
883
+ "loss": 0.3355,
884
+ "positive_losses": 0.33393630385398865,
885
+ "rewards/accuracies": 1.0,
886
+ "rewards/chosen": 0.3180273175239563,
887
+ "rewards/margins": 1.719641923904419,
888
+ "rewards/margins_max": 2.0190768241882324,
889
+ "rewards/margins_min": 1.4202073812484741,
890
+ "rewards/margins_std": 0.4234645366668701,
891
+ "rewards/rejected": -1.401614785194397,
892
+ "step": 400
893
+ },
894
+ {
895
+ "epoch": 1.13,
896
+ "eval_dpo_losses": 0.6571967601776123,
897
+ "eval_logits/chosen": -2.415469169616699,
898
+ "eval_logits/rejected": -2.3773038387298584,
899
+ "eval_logps/chosen": -392.8340759277344,
900
+ "eval_logps/rejected": -401.27020263671875,
901
+ "eval_loss": 11.415841102600098,
902
+ "eval_positive_losses": 108.73988342285156,
903
+ "eval_rewards/accuracies": 0.6547619104385376,
904
+ "eval_rewards/chosen": -1.0761287212371826,
905
+ "eval_rewards/margins": 0.34474655985832214,
906
+ "eval_rewards/margins_max": 1.462611198425293,
907
+ "eval_rewards/margins_min": -0.7661140561103821,
908
+ "eval_rewards/margins_std": 0.9968340992927551,
909
+ "eval_rewards/rejected": -1.4208753108978271,
910
+ "eval_runtime": 282.7743,
911
+ "eval_samples_per_second": 7.073,
912
+ "eval_steps_per_second": 0.223,
913
+ "step": 400
914
+ },
915
+ {
916
+ "dpo_losses": 0.23369018733501434,
917
+ "epoch": 1.15,
918
+ "grad_norm": 1.6752309896133104,
919
+ "learning_rate": 3.864117685978339e-06,
920
+ "logits/chosen": -2.4605586528778076,
921
+ "logits/rejected": -2.4261014461517334,
922
+ "logps/chosen": -264.14556884765625,
923
+ "logps/rejected": -375.3333740234375,
924
+ "loss": 0.3689,
925
+ "positive_losses": 3.3056740760803223,
926
+ "rewards/accuracies": 0.949999988079071,
927
+ "rewards/chosen": 0.3054488003253937,
928
+ "rewards/margins": 1.9768092632293701,
929
+ "rewards/margins_max": 2.963430166244507,
930
+ "rewards/margins_min": 0.990188479423523,
931
+ "rewards/margins_std": 1.3952926397323608,
932
+ "rewards/rejected": -1.6713603734970093,
933
+ "step": 410
934
+ },
935
+ {
936
+ "dpo_losses": 0.1611286997795105,
937
+ "epoch": 1.18,
938
+ "grad_norm": 19.99801702205629,
939
+ "learning_rate": 3.794693501389861e-06,
940
+ "logits/chosen": -2.549091100692749,
941
+ "logits/rejected": -2.4567952156066895,
942
+ "logps/chosen": -349.955810546875,
943
+ "logps/rejected": -440.55987548828125,
944
+ "loss": 0.3233,
945
+ "positive_losses": 0.5550443530082703,
946
+ "rewards/accuracies": 1.0,
947
+ "rewards/chosen": 0.4979848861694336,
948
+ "rewards/margins": 2.247861385345459,
949
+ "rewards/margins_max": 3.1490349769592285,
950
+ "rewards/margins_min": 1.3466877937316895,
951
+ "rewards/margins_std": 1.2744518518447876,
952
+ "rewards/rejected": -1.7498763799667358,
953
+ "step": 420
954
+ },
955
+ {
956
+ "dpo_losses": 0.18394428491592407,
957
+ "epoch": 1.21,
958
+ "grad_norm": 89.7451007324007,
959
+ "learning_rate": 3.7238771323626822e-06,
960
+ "logits/chosen": -2.5034220218658447,
961
+ "logits/rejected": -2.4015450477600098,
962
+ "logps/chosen": -339.3304748535156,
963
+ "logps/rejected": -436.32769775390625,
964
+ "loss": 0.3026,
965
+ "positive_losses": 0.09120301902294159,
966
+ "rewards/accuracies": 1.0,
967
+ "rewards/chosen": 0.4775872230529785,
968
+ "rewards/margins": 2.2062175273895264,
969
+ "rewards/margins_max": 2.9012348651885986,
970
+ "rewards/margins_min": 1.5112000703811646,
971
+ "rewards/margins_std": 0.9829031229019165,
972
+ "rewards/rejected": -1.7286304235458374,
973
+ "step": 430
974
+ },
975
+ {
976
+ "dpo_losses": 0.18616007268428802,
977
+ "epoch": 1.24,
978
+ "grad_norm": 3.3128247371778987,
979
+ "learning_rate": 3.651744727766676e-06,
980
+ "logits/chosen": -2.5054969787597656,
981
+ "logits/rejected": -2.38596773147583,
982
+ "logps/chosen": -275.51483154296875,
983
+ "logps/rejected": -352.8285827636719,
984
+ "loss": 0.3883,
985
+ "positive_losses": 0.630480945110321,
986
+ "rewards/accuracies": 1.0,
987
+ "rewards/chosen": 0.39738765358924866,
988
+ "rewards/margins": 1.9772555828094482,
989
+ "rewards/margins_max": 2.764988422393799,
990
+ "rewards/margins_min": 1.1895228624343872,
991
+ "rewards/margins_std": 1.1140224933624268,
992
+ "rewards/rejected": -1.579867959022522,
993
+ "step": 440
994
+ },
995
+ {
996
+ "dpo_losses": 0.16775104403495789,
997
+ "epoch": 1.27,
998
+ "grad_norm": 2.373995116616861,
999
+ "learning_rate": 3.57837385160529e-06,
1000
+ "logits/chosen": -2.431580066680908,
1001
+ "logits/rejected": -2.3474247455596924,
1002
+ "logps/chosen": -304.8125915527344,
1003
+ "logps/rejected": -460.9014587402344,
1004
+ "loss": 0.3246,
1005
+ "positive_losses": 2.069793224334717,
1006
+ "rewards/accuracies": 1.0,
1007
+ "rewards/chosen": 0.33361926674842834,
1008
+ "rewards/margins": 2.2662878036499023,
1009
+ "rewards/margins_max": 2.9109106063842773,
1010
+ "rewards/margins_min": 1.6216650009155273,
1011
+ "rewards/margins_std": 0.9116341471672058,
1012
+ "rewards/rejected": -1.9326684474945068,
1013
+ "step": 450
1014
+ },
1015
+ {
1016
+ "dpo_losses": 0.13223373889923096,
1017
+ "epoch": 1.3,
1018
+ "grad_norm": 43.7205383502172,
1019
+ "learning_rate": 3.503843399610941e-06,
1020
+ "logits/chosen": -2.416710615158081,
1021
+ "logits/rejected": -2.4022932052612305,
1022
+ "logps/chosen": -355.0602111816406,
1023
+ "logps/rejected": -625.78369140625,
1024
+ "loss": 0.3215,
1025
+ "positive_losses": 3.091512680053711,
1026
+ "rewards/accuracies": 1.0,
1027
+ "rewards/chosen": 0.40217727422714233,
1028
+ "rewards/margins": 2.6340858936309814,
1029
+ "rewards/margins_max": 3.5210165977478027,
1030
+ "rewards/margins_min": 1.7471544742584229,
1031
+ "rewards/margins_std": 1.2543103694915771,
1032
+ "rewards/rejected": -2.2319083213806152,
1033
+ "step": 460
1034
+ },
1035
+ {
1036
+ "dpo_losses": 0.13370443880558014,
1037
+ "epoch": 1.32,
1038
+ "grad_norm": 93.10254553112391,
1039
+ "learning_rate": 3.4282335144083985e-06,
1040
+ "logits/chosen": -2.4071993827819824,
1041
+ "logits/rejected": -2.301910638809204,
1042
+ "logps/chosen": -323.6948547363281,
1043
+ "logps/rejected": -422.40106201171875,
1044
+ "loss": 0.3844,
1045
+ "positive_losses": 0.8020246624946594,
1046
+ "rewards/accuracies": 1.0,
1047
+ "rewards/chosen": 0.33555978536605835,
1048
+ "rewards/margins": 2.352768659591675,
1049
+ "rewards/margins_max": 2.6321401596069336,
1050
+ "rewards/margins_min": 2.073397159576416,
1051
+ "rewards/margins_std": 0.395090788602829,
1052
+ "rewards/rejected": -2.017209053039551,
1053
+ "step": 470
1054
+ },
1055
+ {
1056
+ "dpo_losses": 0.16256804764270782,
1057
+ "epoch": 1.35,
1058
+ "grad_norm": 25.781294981021368,
1059
+ "learning_rate": 3.351625499337395e-06,
1060
+ "logits/chosen": -2.5639257431030273,
1061
+ "logits/rejected": -2.3825831413269043,
1062
+ "logps/chosen": -368.14764404296875,
1063
+ "logps/rejected": -455.31170654296875,
1064
+ "loss": 0.3346,
1065
+ "positive_losses": 5.322105407714844,
1066
+ "rewards/accuracies": 1.0,
1067
+ "rewards/chosen": 0.47703951597213745,
1068
+ "rewards/margins": 2.266021251678467,
1069
+ "rewards/margins_max": 2.826010227203369,
1070
+ "rewards/margins_min": 1.706032156944275,
1071
+ "rewards/margins_std": 0.791944146156311,
1072
+ "rewards/rejected": -1.7889817953109741,
1073
+ "step": 480
1074
+ },
1075
+ {
1076
+ "dpo_losses": 0.27087026834487915,
1077
+ "epoch": 1.38,
1078
+ "grad_norm": 1.9694210301585493,
1079
+ "learning_rate": 3.2741017310271056e-06,
1080
+ "logits/chosen": -2.434044361114502,
1081
+ "logits/rejected": -2.2838988304138184,
1082
+ "logps/chosen": -185.84713745117188,
1083
+ "logps/rejected": -353.2933044433594,
1084
+ "loss": 0.4342,
1085
+ "positive_losses": 2.383211851119995,
1086
+ "rewards/accuracies": 0.949999988079071,
1087
+ "rewards/chosen": 0.31434357166290283,
1088
+ "rewards/margins": 1.6815185546875,
1089
+ "rewards/margins_max": 2.3663582801818848,
1090
+ "rewards/margins_min": 0.9966787099838257,
1091
+ "rewards/margins_std": 0.9685096740722656,
1092
+ "rewards/rejected": -1.3671748638153076,
1093
+ "step": 490
1094
+ },
1095
+ {
1096
+ "dpo_losses": 0.1878516972064972,
1097
+ "epoch": 1.41,
1098
+ "grad_norm": 37.46970781719882,
1099
+ "learning_rate": 3.195745570816532e-06,
1100
+ "logits/chosen": -2.3778724670410156,
1101
+ "logits/rejected": -2.298856258392334,
1102
+ "logps/chosen": -339.0552978515625,
1103
+ "logps/rejected": -400.7374572753906,
1104
+ "loss": 0.3438,
1105
+ "positive_losses": 1.1473572254180908,
1106
+ "rewards/accuracies": 1.0,
1107
+ "rewards/chosen": 0.4514932632446289,
1108
+ "rewards/margins": 2.0797715187072754,
1109
+ "rewards/margins_max": 2.5511765480041504,
1110
+ "rewards/margins_min": 1.6083663702011108,
1111
+ "rewards/margins_std": 0.6666676998138428,
1112
+ "rewards/rejected": -1.628278136253357,
1113
+ "step": 500
1114
+ },
1115
+ {
1116
+ "epoch": 1.41,
1117
+ "eval_dpo_losses": 0.6380993127822876,
1118
+ "eval_logits/chosen": -2.490675449371338,
1119
+ "eval_logits/rejected": -2.4471161365509033,
1120
+ "eval_logps/chosen": -385.29376220703125,
1121
+ "eval_logps/rejected": -393.24566650390625,
1122
+ "eval_loss": 10.641342163085938,
1123
+ "eval_positive_losses": 101.35254669189453,
1124
+ "eval_rewards/accuracies": 0.6865079402923584,
1125
+ "eval_rewards/chosen": -1.0007256269454956,
1126
+ "eval_rewards/margins": 0.33990418910980225,
1127
+ "eval_rewards/margins_max": 1.3353358507156372,
1128
+ "eval_rewards/margins_min": -0.6337663531303406,
1129
+ "eval_rewards/margins_std": 0.8805232644081116,
1130
+ "eval_rewards/rejected": -1.3406296968460083,
1131
+ "eval_runtime": 281.3613,
1132
+ "eval_samples_per_second": 7.108,
1133
+ "eval_steps_per_second": 0.224,
1134
+ "step": 500
1135
+ },
1136
+ {
1137
+ "dpo_losses": 0.15203723311424255,
1138
+ "epoch": 1.44,
1139
+ "grad_norm": 7.547244740150183,
1140
+ "learning_rate": 3.116641275116018e-06,
1141
+ "logits/chosen": -2.2004570960998535,
1142
+ "logits/rejected": -2.1605918407440186,
1143
+ "logps/chosen": -256.17730712890625,
1144
+ "logps/rejected": -509.5887756347656,
1145
+ "loss": 0.3429,
1146
+ "positive_losses": 2.6612415313720703,
1147
+ "rewards/accuracies": 1.0,
1148
+ "rewards/chosen": 0.3650303781032562,
1149
+ "rewards/margins": 2.308377504348755,
1150
+ "rewards/margins_max": 3.1350021362304688,
1151
+ "rewards/margins_min": 1.481752634048462,
1152
+ "rewards/margins_std": 1.1690237522125244,
1153
+ "rewards/rejected": -1.9433467388153076,
1154
+ "step": 510
1155
+ },
1156
+ {
1157
+ "dpo_losses": 0.12489266693592072,
1158
+ "epoch": 1.46,
1159
+ "grad_norm": 145.8887364997723,
1160
+ "learning_rate": 3.0368739048062956e-06,
1161
+ "logits/chosen": -2.448385238647461,
1162
+ "logits/rejected": -2.354788064956665,
1163
+ "logps/chosen": -307.94976806640625,
1164
+ "logps/rejected": -450.015380859375,
1165
+ "loss": 0.5303,
1166
+ "positive_losses": 3.1661620140075684,
1167
+ "rewards/accuracies": 1.0,
1168
+ "rewards/chosen": 0.3366628587245941,
1169
+ "rewards/margins": 2.504631519317627,
1170
+ "rewards/margins_max": 3.252946138381958,
1171
+ "rewards/margins_min": 1.7563165426254272,
1172
+ "rewards/margins_std": 1.058276891708374,
1173
+ "rewards/rejected": -2.16796875,
1174
+ "step": 520
1175
+ },
1176
+ {
1177
+ "dpo_losses": 0.14785408973693848,
1178
+ "epoch": 1.49,
1179
+ "grad_norm": 19.752652919627817,
1180
+ "learning_rate": 2.956529233772492e-06,
1181
+ "logits/chosen": -2.3587186336517334,
1182
+ "logits/rejected": -2.3671178817749023,
1183
+ "logps/chosen": -339.9403076171875,
1184
+ "logps/rejected": -447.1279296875,
1185
+ "loss": 0.2974,
1186
+ "positive_losses": 0.8550773859024048,
1187
+ "rewards/accuracies": 1.0,
1188
+ "rewards/chosen": 0.39065319299697876,
1189
+ "rewards/margins": 2.165048122406006,
1190
+ "rewards/margins_max": 2.7594637870788574,
1191
+ "rewards/margins_min": 1.5706324577331543,
1192
+ "rewards/margins_std": 0.8406306505203247,
1193
+ "rewards/rejected": -1.7743949890136719,
1194
+ "step": 530
1195
+ },
1196
+ {
1197
+ "dpo_losses": 0.1259264200925827,
1198
+ "epoch": 1.52,
1199
+ "grad_norm": 75.31648718698649,
1200
+ "learning_rate": 2.8756936566714317e-06,
1201
+ "logits/chosen": -2.5206494331359863,
1202
+ "logits/rejected": -2.379965305328369,
1203
+ "logps/chosen": -351.1200256347656,
1204
+ "logps/rejected": -426.8377990722656,
1205
+ "loss": 0.4729,
1206
+ "positive_losses": 1.8278591632843018,
1207
+ "rewards/accuracies": 1.0,
1208
+ "rewards/chosen": 0.48433223366737366,
1209
+ "rewards/margins": 2.3780269622802734,
1210
+ "rewards/margins_max": 2.926578998565674,
1211
+ "rewards/margins_min": 1.8294748067855835,
1212
+ "rewards/margins_std": 0.7757696509361267,
1213
+ "rewards/rejected": -1.893694281578064,
1214
+ "step": 540
1215
+ },
1216
+ {
1217
+ "dpo_losses": 0.20844757556915283,
1218
+ "epoch": 1.55,
1219
+ "grad_norm": 3.349676851087858,
1220
+ "learning_rate": 2.794454096031429e-06,
1221
+ "logits/chosen": -2.4806084632873535,
1222
+ "logits/rejected": -2.4055755138397217,
1223
+ "logps/chosen": -274.53900146484375,
1224
+ "logps/rejected": -464.46307373046875,
1225
+ "loss": 0.2124,
1226
+ "positive_losses": 1.989834189414978,
1227
+ "rewards/accuracies": 1.0,
1228
+ "rewards/chosen": 0.36409634351730347,
1229
+ "rewards/margins": 2.338207960128784,
1230
+ "rewards/margins_max": 3.5889182090759277,
1231
+ "rewards/margins_min": 1.0874969959259033,
1232
+ "rewards/margins_std": 1.7687723636627197,
1233
+ "rewards/rejected": -1.974111557006836,
1234
+ "step": 550
1235
+ },
1236
+ {
1237
+ "dpo_losses": 0.18082240223884583,
1238
+ "epoch": 1.58,
1239
+ "grad_norm": 2.0680076581049485,
1240
+ "learning_rate": 2.71289790878446e-06,
1241
+ "logits/chosen": -2.375046730041504,
1242
+ "logits/rejected": -2.316608190536499,
1243
+ "logps/chosen": -295.595703125,
1244
+ "logps/rejected": -556.7249755859375,
1245
+ "loss": 0.3351,
1246
+ "positive_losses": 4.852077484130859,
1247
+ "rewards/accuracies": 0.949999988079071,
1248
+ "rewards/chosen": 0.21087315678596497,
1249
+ "rewards/margins": 2.390120267868042,
1250
+ "rewards/margins_max": 3.270312786102295,
1251
+ "rewards/margins_min": 1.5099279880523682,
1252
+ "rewards/margins_std": 1.24478018283844,
1253
+ "rewards/rejected": -2.1792471408843994,
1254
+ "step": 560
1255
+ },
1256
+ {
1257
+ "dpo_losses": 0.11622228473424911,
1258
+ "epoch": 1.61,
1259
+ "grad_norm": 53.09814348729035,
1260
+ "learning_rate": 2.6311127923312156e-06,
1261
+ "logits/chosen": -2.38314151763916,
1262
+ "logits/rejected": -2.2578694820404053,
1263
+ "logps/chosen": -386.07366943359375,
1264
+ "logps/rejected": -540.0533447265625,
1265
+ "loss": 0.2485,
1266
+ "positive_losses": 0.0,
1267
+ "rewards/accuracies": 1.0,
1268
+ "rewards/chosen": 0.4612743854522705,
1269
+ "rewards/margins": 2.703238010406494,
1270
+ "rewards/margins_max": 3.3717334270477295,
1271
+ "rewards/margins_min": 2.0347423553466797,
1272
+ "rewards/margins_std": 0.9453955888748169,
1273
+ "rewards/rejected": -2.2419633865356445,
1274
+ "step": 570
1275
+ },
1276
+ {
1277
+ "dpo_losses": 0.11324157565832138,
1278
+ "epoch": 1.63,
1279
+ "grad_norm": 28.84783476798183,
1280
+ "learning_rate": 2.549186690240057e-06,
1281
+ "logits/chosen": -2.3538641929626465,
1282
+ "logits/rejected": -2.2661383152008057,
1283
+ "logps/chosen": -265.77618408203125,
1284
+ "logps/rejected": -461.02362060546875,
1285
+ "loss": 0.3852,
1286
+ "positive_losses": 0.5109559893608093,
1287
+ "rewards/accuracies": 0.949999988079071,
1288
+ "rewards/chosen": 0.326893150806427,
1289
+ "rewards/margins": 2.722642660140991,
1290
+ "rewards/margins_max": 3.37109375,
1291
+ "rewards/margins_min": 2.0741915702819824,
1292
+ "rewards/margins_std": 0.9170483350753784,
1293
+ "rewards/rejected": -2.395749568939209,
1294
+ "step": 580
1295
+ },
1296
+ {
1297
+ "dpo_losses": 0.19121481478214264,
1298
+ "epoch": 1.66,
1299
+ "grad_norm": 3.235430116905917,
1300
+ "learning_rate": 2.4672076976812548e-06,
1301
+ "logits/chosen": -2.256422281265259,
1302
+ "logits/rejected": -2.129117965698242,
1303
+ "logps/chosen": -335.0826721191406,
1304
+ "logps/rejected": -496.73138427734375,
1305
+ "loss": 0.3539,
1306
+ "positive_losses": 0.32223206758499146,
1307
+ "rewards/accuracies": 1.0,
1308
+ "rewards/chosen": 0.3417799472808838,
1309
+ "rewards/margins": 2.386916399002075,
1310
+ "rewards/margins_max": 3.4707369804382324,
1311
+ "rewards/margins_min": 1.3030953407287598,
1312
+ "rewards/margins_std": 1.5327543020248413,
1313
+ "rewards/rejected": -2.0451362133026123,
1314
+ "step": 590
1315
+ },
1316
+ {
1317
+ "dpo_losses": 0.2605344355106354,
1318
+ "epoch": 1.69,
1319
+ "grad_norm": 13.287666189135933,
1320
+ "learning_rate": 2.3852639666982218e-06,
1321
+ "logits/chosen": -2.4083638191223145,
1322
+ "logits/rejected": -2.3480491638183594,
1323
+ "logps/chosen": -243.49948120117188,
1324
+ "logps/rejected": -401.8130798339844,
1325
+ "loss": 0.2144,
1326
+ "positive_losses": 0.026648616418242455,
1327
+ "rewards/accuracies": 1.0,
1328
+ "rewards/chosen": 0.35153132677078247,
1329
+ "rewards/margins": 1.764525055885315,
1330
+ "rewards/margins_max": 2.218550205230713,
1331
+ "rewards/margins_min": 1.310499668121338,
1332
+ "rewards/margins_std": 0.642088770866394,
1333
+ "rewards/rejected": -1.4129936695098877,
1334
+ "step": 600
1335
+ },
1336
+ {
1337
+ "epoch": 1.69,
1338
+ "eval_dpo_losses": 0.6267468929290771,
1339
+ "eval_logits/chosen": -2.3438363075256348,
1340
+ "eval_logits/rejected": -2.302872896194458,
1341
+ "eval_logps/chosen": -363.3914489746094,
1342
+ "eval_logps/rejected": -380.5317687988281,
1343
+ "eval_loss": 8.589625358581543,
1344
+ "eval_positive_losses": 79.79975891113281,
1345
+ "eval_rewards/accuracies": 0.6865079402923584,
1346
+ "eval_rewards/chosen": -0.7817028760910034,
1347
+ "eval_rewards/margins": 0.4317886233329773,
1348
+ "eval_rewards/margins_max": 1.5951305627822876,
1349
+ "eval_rewards/margins_min": -0.6660595536231995,
1350
+ "eval_rewards/margins_std": 1.0046793222427368,
1351
+ "eval_rewards/rejected": -1.2134915590286255,
1352
+ "eval_runtime": 280.6736,
1353
+ "eval_samples_per_second": 7.126,
1354
+ "eval_steps_per_second": 0.224,
1355
+ "step": 600
1356
+ },
1357
+ {
1358
+ "dpo_losses": 0.21777740120887756,
1359
+ "epoch": 1.72,
1360
+ "grad_norm": 3.6630257967936934,
1361
+ "learning_rate": 2.303443611417584e-06,
1362
+ "logits/chosen": -2.2210001945495605,
1363
+ "logits/rejected": -2.135100841522217,
1364
+ "logps/chosen": -324.7200622558594,
1365
+ "logps/rejected": -427.8872985839844,
1366
+ "loss": 0.4399,
1367
+ "positive_losses": 1.8486969470977783,
1368
+ "rewards/accuracies": 1.0,
1369
+ "rewards/chosen": 0.4076939523220062,
1370
+ "rewards/margins": 2.0033507347106934,
1371
+ "rewards/margins_max": 2.8805928230285645,
1372
+ "rewards/margins_min": 1.12610924243927,
1373
+ "rewards/margins_std": 1.2406072616577148,
1374
+ "rewards/rejected": -1.5956569910049438,
1375
+ "step": 610
1376
+ },
1377
+ {
1378
+ "dpo_losses": 0.16951796412467957,
1379
+ "epoch": 1.75,
1380
+ "grad_norm": 171.16064616866518,
1381
+ "learning_rate": 2.2218346133000264e-06,
1382
+ "logits/chosen": -2.2120890617370605,
1383
+ "logits/rejected": -2.1098124980926514,
1384
+ "logps/chosen": -283.6782531738281,
1385
+ "logps/rejected": -384.42193603515625,
1386
+ "loss": 0.445,
1387
+ "positive_losses": 6.724704742431641,
1388
+ "rewards/accuracies": 0.949999988079071,
1389
+ "rewards/chosen": 0.2760510742664337,
1390
+ "rewards/margins": 2.14823317527771,
1391
+ "rewards/margins_max": 2.824097156524658,
1392
+ "rewards/margins_min": 1.4723690748214722,
1393
+ "rewards/margins_std": 0.9558159708976746,
1394
+ "rewards/rejected": -1.8721821308135986,
1395
+ "step": 620
1396
+ },
1397
+ {
1398
+ "dpo_losses": 0.15758368372917175,
1399
+ "epoch": 1.77,
1400
+ "grad_norm": 24.7162522936105,
1401
+ "learning_rate": 2.140524726533792e-06,
1402
+ "logits/chosen": -2.237668514251709,
1403
+ "logits/rejected": -2.1284663677215576,
1404
+ "logps/chosen": -338.96942138671875,
1405
+ "logps/rejected": -402.83697509765625,
1406
+ "loss": 0.3769,
1407
+ "positive_losses": 0.14499235153198242,
1408
+ "rewards/accuracies": 1.0,
1409
+ "rewards/chosen": 0.5579421520233154,
1410
+ "rewards/margins": 2.344921350479126,
1411
+ "rewards/margins_max": 2.9339561462402344,
1412
+ "rewards/margins_min": 1.7558867931365967,
1413
+ "rewards/margins_std": 0.833020806312561,
1414
+ "rewards/rejected": -1.7869793176651,
1415
+ "step": 630
1416
+ },
1417
+ {
1418
+ "dpo_losses": 0.18143755197525024,
1419
+ "epoch": 1.8,
1420
+ "grad_norm": 28.54900764303978,
1421
+ "learning_rate": 2.059601383672566e-06,
1422
+ "logits/chosen": -2.421058177947998,
1423
+ "logits/rejected": -2.359043598175049,
1424
+ "logps/chosen": -261.0716857910156,
1425
+ "logps/rejected": -371.5032653808594,
1426
+ "loss": 0.2764,
1427
+ "positive_losses": 0.5634332895278931,
1428
+ "rewards/accuracies": 1.0,
1429
+ "rewards/chosen": 0.4140376150608063,
1430
+ "rewards/margins": 2.1301121711730957,
1431
+ "rewards/margins_max": 2.458400011062622,
1432
+ "rewards/margins_min": 1.8018243312835693,
1433
+ "rewards/margins_std": 0.46426907181739807,
1434
+ "rewards/rejected": -1.7160745859146118,
1435
+ "step": 640
1436
+ },
1437
+ {
1438
+ "dpo_losses": 0.21564999222755432,
1439
+ "epoch": 1.83,
1440
+ "grad_norm": 43.347219579951144,
1441
+ "learning_rate": 1.9791516016192214e-06,
1442
+ "logits/chosen": -2.4876837730407715,
1443
+ "logits/rejected": -2.327549457550049,
1444
+ "logps/chosen": -288.0809020996094,
1445
+ "logps/rejected": -400.9809875488281,
1446
+ "loss": 0.2601,
1447
+ "positive_losses": 2.1945955753326416,
1448
+ "rewards/accuracies": 0.8999999761581421,
1449
+ "rewards/chosen": 0.4631883203983307,
1450
+ "rewards/margins": 2.244870662689209,
1451
+ "rewards/margins_max": 3.217092990875244,
1452
+ "rewards/margins_min": 1.2726480960845947,
1453
+ "rewards/margins_std": 1.3749301433563232,
1454
+ "rewards/rejected": -1.7816823720932007,
1455
+ "step": 650
1456
+ },
1457
+ {
1458
+ "dpo_losses": 0.21185970306396484,
1459
+ "epoch": 1.86,
1460
+ "grad_norm": 4.895621870086889,
1461
+ "learning_rate": 1.8992618880565039e-06,
1462
+ "logits/chosen": -2.147622585296631,
1463
+ "logits/rejected": -2.0906789302825928,
1464
+ "logps/chosen": -282.22711181640625,
1465
+ "logps/rejected": -362.41119384765625,
1466
+ "loss": 0.4916,
1467
+ "positive_losses": 3.9535961151123047,
1468
+ "rewards/accuracies": 1.0,
1469
+ "rewards/chosen": 0.4179634153842926,
1470
+ "rewards/margins": 2.141587495803833,
1471
+ "rewards/margins_max": 3.0650925636291504,
1472
+ "rewards/margins_min": 1.218082308769226,
1473
+ "rewards/margins_std": 1.3060333728790283,
1474
+ "rewards/rejected": -1.7236239910125732,
1475
+ "step": 660
1476
+ },
1477
+ {
1478
+ "dpo_losses": 0.11567674577236176,
1479
+ "epoch": 1.89,
1480
+ "grad_norm": 259.9155211313861,
1481
+ "learning_rate": 1.8200181484252888e-06,
1482
+ "logits/chosen": -2.390129566192627,
1483
+ "logits/rejected": -2.4018349647521973,
1484
+ "logps/chosen": -326.493896484375,
1485
+ "logps/rejected": -549.7818603515625,
1486
+ "loss": 0.2561,
1487
+ "positive_losses": 0.0,
1488
+ "rewards/accuracies": 1.0,
1489
+ "rewards/chosen": 0.562630295753479,
1490
+ "rewards/margins": 2.886990785598755,
1491
+ "rewards/margins_max": 3.560823440551758,
1492
+ "rewards/margins_min": 2.2131576538085938,
1493
+ "rewards/margins_std": 0.9529436826705933,
1494
+ "rewards/rejected": -2.3243603706359863,
1495
+ "step": 670
1496
+ },
1497
+ {
1498
+ "dpo_losses": 0.1450214684009552,
1499
+ "epoch": 1.92,
1500
+ "grad_norm": 3.1779421953498246,
1501
+ "learning_rate": 1.7415055935504234e-06,
1502
+ "logits/chosen": -2.4584078788757324,
1503
+ "logits/rejected": -2.2816786766052246,
1504
+ "logps/chosen": -351.3116760253906,
1505
+ "logps/rejected": -523.8319091796875,
1506
+ "loss": 0.4032,
1507
+ "positive_losses": 3.919013500213623,
1508
+ "rewards/accuracies": 1.0,
1509
+ "rewards/chosen": 0.4204772114753723,
1510
+ "rewards/margins": 2.6883883476257324,
1511
+ "rewards/margins_max": 3.6081409454345703,
1512
+ "rewards/margins_min": 1.7686359882354736,
1513
+ "rewards/margins_std": 1.300726294517517,
1514
+ "rewards/rejected": -2.267911434173584,
1515
+ "step": 680
1516
+ },
1517
+ {
1518
+ "dpo_losses": 0.22765210270881653,
1519
+ "epoch": 1.94,
1520
+ "grad_norm": 290.4438144891366,
1521
+ "learning_rate": 1.6638086480134954e-06,
1522
+ "logits/chosen": -2.2493913173675537,
1523
+ "logits/rejected": -2.2027366161346436,
1524
+ "logps/chosen": -208.2427520751953,
1525
+ "logps/rejected": -294.7981872558594,
1526
+ "loss": 0.3929,
1527
+ "positive_losses": 4.185807228088379,
1528
+ "rewards/accuracies": 0.949999988079071,
1529
+ "rewards/chosen": 0.2872256338596344,
1530
+ "rewards/margins": 1.9401639699935913,
1531
+ "rewards/margins_max": 2.823594331741333,
1532
+ "rewards/margins_min": 1.0567338466644287,
1533
+ "rewards/margins_std": 1.249358892440796,
1534
+ "rewards/rejected": -1.6529382467269897,
1535
+ "step": 690
1536
+ },
1537
+ {
1538
+ "dpo_losses": 0.11677996069192886,
1539
+ "epoch": 1.97,
1540
+ "grad_norm": 2.93724058913164,
1541
+ "learning_rate": 1.5870108593710473e-06,
1542
+ "logits/chosen": -2.169877290725708,
1543
+ "logits/rejected": -2.062708854675293,
1544
+ "logps/chosen": -379.1634826660156,
1545
+ "logps/rejected": -440.5103454589844,
1546
+ "loss": 0.3314,
1547
+ "positive_losses": 0.9755552411079407,
1548
+ "rewards/accuracies": 1.0,
1549
+ "rewards/chosen": 0.478015273809433,
1550
+ "rewards/margins": 2.7329649925231934,
1551
+ "rewards/margins_max": 3.358954906463623,
1552
+ "rewards/margins_min": 2.1069746017456055,
1553
+ "rewards/margins_std": 0.8852837681770325,
1554
+ "rewards/rejected": -2.2549493312835693,
1555
+ "step": 700
1556
+ },
1557
+ {
1558
+ "epoch": 1.97,
1559
+ "eval_dpo_losses": 0.6525446176528931,
1560
+ "eval_logits/chosen": -2.343222141265869,
1561
+ "eval_logits/rejected": -2.30248761177063,
1562
+ "eval_logps/chosen": -391.17047119140625,
1563
+ "eval_logps/rejected": -410.6868591308594,
1564
+ "eval_loss": 11.165140151977539,
1565
+ "eval_positive_losses": 107.29693603515625,
1566
+ "eval_rewards/accuracies": 0.6626983880996704,
1567
+ "eval_rewards/chosen": -1.0594924688339233,
1568
+ "eval_rewards/margins": 0.4555494785308838,
1569
+ "eval_rewards/margins_max": 1.7776461839675903,
1570
+ "eval_rewards/margins_min": -0.8450111150741577,
1571
+ "eval_rewards/margins_std": 1.16599440574646,
1572
+ "eval_rewards/rejected": -1.5150419473648071,
1573
+ "eval_runtime": 281.451,
1574
+ "eval_samples_per_second": 7.106,
1575
+ "eval_steps_per_second": 0.224,
1576
+ "step": 700
1577
+ },
1578
+ {
1579
+ "dpo_losses": 0.2008899748325348,
1580
+ "epoch": 2.0,
1581
+ "grad_norm": 1.5771255349778672,
1582
+ "learning_rate": 1.511194808315853e-06,
1583
+ "logits/chosen": -2.2691843509674072,
1584
+ "logits/rejected": -2.16029691696167,
1585
+ "logps/chosen": -264.2540588378906,
1586
+ "logps/rejected": -368.97308349609375,
1587
+ "loss": 0.2577,
1588
+ "positive_losses": 1.1838890314102173,
1589
+ "rewards/accuracies": 1.0,
1590
+ "rewards/chosen": 0.4201991558074951,
1591
+ "rewards/margins": 2.272444248199463,
1592
+ "rewards/margins_max": 3.1316885948181152,
1593
+ "rewards/margins_min": 1.413199782371521,
1594
+ "rewards/margins_std": 1.2151552438735962,
1595
+ "rewards/rejected": -1.8522450923919678,
1596
+ "step": 710
1597
+ },
1598
+ {
1599
+ "dpo_losses": 0.11133086681365967,
1600
+ "epoch": 2.03,
1601
+ "grad_norm": 39.920300238021134,
1602
+ "learning_rate": 1.4364420198778662e-06,
1603
+ "logits/chosen": -2.3695783615112305,
1604
+ "logits/rejected": -2.292491912841797,
1605
+ "logps/chosen": -303.01556396484375,
1606
+ "logps/rejected": -584.0543212890625,
1607
+ "loss": 0.2033,
1608
+ "positive_losses": 1.3720115423202515,
1609
+ "rewards/accuracies": 1.0,
1610
+ "rewards/chosen": 0.44599366188049316,
1611
+ "rewards/margins": 2.9826126098632812,
1612
+ "rewards/margins_max": 3.897780179977417,
1613
+ "rewards/margins_min": 2.0674448013305664,
1614
+ "rewards/margins_std": 1.2942426204681396,
1615
+ "rewards/rejected": -2.536618709564209,
1616
+ "step": 720
1617
+ },
1618
+ {
1619
+ "dpo_losses": 0.06813247501850128,
1620
+ "epoch": 2.06,
1621
+ "grad_norm": 61.35667528119903,
1622
+ "learning_rate": 1.3628328757603243e-06,
1623
+ "logits/chosen": -2.3125011920928955,
1624
+ "logits/rejected": -2.234978437423706,
1625
+ "logps/chosen": -380.5849609375,
1626
+ "logps/rejected": -546.2708129882812,
1627
+ "loss": 0.2931,
1628
+ "positive_losses": 3.713000535964966,
1629
+ "rewards/accuracies": 1.0,
1630
+ "rewards/chosen": 0.42165422439575195,
1631
+ "rewards/margins": 3.356724500656128,
1632
+ "rewards/margins_max": 3.9914348125457764,
1633
+ "rewards/margins_min": 2.7220141887664795,
1634
+ "rewards/margins_std": 0.8976157903671265,
1635
+ "rewards/rejected": -2.935070276260376,
1636
+ "step": 730
1637
+ },
1638
+ {
1639
+ "dpo_losses": 0.1471334546804428,
1640
+ "epoch": 2.08,
1641
+ "grad_norm": 81.02956274757072,
1642
+ "learning_rate": 1.2904465279052725e-06,
1643
+ "logits/chosen": -2.3687386512756348,
1644
+ "logits/rejected": -2.246112585067749,
1645
+ "logps/chosen": -307.13690185546875,
1646
+ "logps/rejected": -456.7256774902344,
1647
+ "loss": 0.2511,
1648
+ "positive_losses": 0.0,
1649
+ "rewards/accuracies": 1.0,
1650
+ "rewards/chosen": 0.44816774129867554,
1651
+ "rewards/margins": 2.7628419399261475,
1652
+ "rewards/margins_max": 3.8002758026123047,
1653
+ "rewards/margins_min": 1.7254081964492798,
1654
+ "rewards/margins_std": 1.4671531915664673,
1655
+ "rewards/rejected": -2.314674139022827,
1656
+ "step": 740
1657
+ },
1658
+ {
1659
+ "dpo_losses": 0.18885108828544617,
1660
+ "epoch": 2.11,
1661
+ "grad_norm": 37.34403765579587,
1662
+ "learning_rate": 1.219360813381446e-06,
1663
+ "logits/chosen": -2.1814396381378174,
1664
+ "logits/rejected": -2.133742570877075,
1665
+ "logps/chosen": -167.7379913330078,
1666
+ "logps/rejected": -342.445068359375,
1667
+ "loss": 0.2246,
1668
+ "positive_losses": 3.9296765327453613,
1669
+ "rewards/accuracies": 0.949999988079071,
1670
+ "rewards/chosen": 0.2083771526813507,
1671
+ "rewards/margins": 2.266416072845459,
1672
+ "rewards/margins_max": 3.0310301780700684,
1673
+ "rewards/margins_min": 1.501802682876587,
1674
+ "rewards/margins_std": 1.0813268423080444,
1675
+ "rewards/rejected": -2.0580391883850098,
1676
+ "step": 750
1677
+ },
1678
+ {
1679
+ "dpo_losses": 0.07930545508861542,
1680
+ "epoch": 2.14,
1681
+ "grad_norm": 53.31665337719304,
1682
+ "learning_rate": 1.1496521706860392e-06,
1683
+ "logits/chosen": -2.3485515117645264,
1684
+ "logits/rejected": -2.2168877124786377,
1685
+ "logps/chosen": -298.58966064453125,
1686
+ "logps/rejected": -549.2097778320312,
1687
+ "loss": 0.2258,
1688
+ "positive_losses": 3.5301315784454346,
1689
+ "rewards/accuracies": 1.0,
1690
+ "rewards/chosen": 0.344715416431427,
1691
+ "rewards/margins": 3.2088115215301514,
1692
+ "rewards/margins_max": 4.005995750427246,
1693
+ "rewards/margins_min": 2.411628246307373,
1694
+ "rewards/margins_std": 1.1273881196975708,
1695
+ "rewards/rejected": -2.8640968799591064,
1696
+ "step": 760
1697
+ },
1698
+ {
1699
+ "dpo_losses": 0.13964664936065674,
1700
+ "epoch": 2.17,
1701
+ "grad_norm": 1.2653188452887754,
1702
+ "learning_rate": 1.0813955575503588e-06,
1703
+ "logits/chosen": -2.278029680252075,
1704
+ "logits/rejected": -2.2471253871917725,
1705
+ "logps/chosen": -296.67205810546875,
1706
+ "logps/rejected": -532.7224731445312,
1707
+ "loss": 0.2746,
1708
+ "positive_losses": 1.8862769603729248,
1709
+ "rewards/accuracies": 1.0,
1710
+ "rewards/chosen": 0.36964336037635803,
1711
+ "rewards/margins": 3.1202445030212402,
1712
+ "rewards/margins_max": 4.3073296546936035,
1713
+ "rewards/margins_min": 1.9331591129302979,
1714
+ "rewards/margins_std": 1.6787922382354736,
1715
+ "rewards/rejected": -2.750600814819336,
1716
+ "step": 770
1717
+ },
1718
+ {
1719
+ "dpo_losses": 0.18134805560112,
1720
+ "epoch": 2.2,
1721
+ "grad_norm": 49.5060636871573,
1722
+ "learning_rate": 1.0146643703377488e-06,
1723
+ "logits/chosen": -2.3735833168029785,
1724
+ "logits/rejected": -2.250998020172119,
1725
+ "logps/chosen": -292.6219177246094,
1726
+ "logps/rejected": -449.14068603515625,
1727
+ "loss": 0.2041,
1728
+ "positive_losses": 1.1141910552978516,
1729
+ "rewards/accuracies": 1.0,
1730
+ "rewards/chosen": 0.3442000448703766,
1731
+ "rewards/margins": 2.5554795265197754,
1732
+ "rewards/margins_max": 3.6242382526397705,
1733
+ "rewards/margins_min": 1.4867204427719116,
1734
+ "rewards/margins_std": 1.51145339012146,
1735
+ "rewards/rejected": -2.2112793922424316,
1736
+ "step": 780
1737
+ },
1738
+ {
1739
+ "dpo_losses": 0.09527120739221573,
1740
+ "epoch": 2.23,
1741
+ "grad_norm": 22.61766042791389,
1742
+ "learning_rate": 9.495303651204496e-07,
1743
+ "logits/chosen": -2.317281484603882,
1744
+ "logits/rejected": -2.2382774353027344,
1745
+ "logps/chosen": -320.7598571777344,
1746
+ "logps/rejected": -518.2537841796875,
1747
+ "loss": 0.1938,
1748
+ "positive_losses": 0.0011909485328942537,
1749
+ "rewards/accuracies": 1.0,
1750
+ "rewards/chosen": 0.4521896243095398,
1751
+ "rewards/margins": 2.8671340942382812,
1752
+ "rewards/margins_max": 3.515406847000122,
1753
+ "rewards/margins_min": 2.2188615798950195,
1754
+ "rewards/margins_std": 0.9167959094047546,
1755
+ "rewards/rejected": -2.414944648742676,
1756
+ "step": 790
1757
+ },
1758
+ {
1759
+ "dpo_losses": 0.09652134031057358,
1760
+ "epoch": 2.25,
1761
+ "grad_norm": 55.89571807239017,
1762
+ "learning_rate": 8.860635805202616e-07,
1763
+ "logits/chosen": -2.334362506866455,
1764
+ "logits/rejected": -2.2386062145233154,
1765
+ "logps/chosen": -323.18060302734375,
1766
+ "logps/rejected": -481.80712890625,
1767
+ "loss": 0.1352,
1768
+ "positive_losses": 0.3718675673007965,
1769
+ "rewards/accuracies": 1.0,
1770
+ "rewards/chosen": 0.4021781086921692,
1771
+ "rewards/margins": 2.7669034004211426,
1772
+ "rewards/margins_max": 3.376081943511963,
1773
+ "rewards/margins_min": 2.1577250957489014,
1774
+ "rewards/margins_std": 0.8615081906318665,
1775
+ "rewards/rejected": -2.364725112915039,
1776
+ "step": 800
1777
+ },
1778
+ {
1779
+ "epoch": 2.25,
1780
+ "eval_dpo_losses": 0.6700040102005005,
1781
+ "eval_logits/chosen": -2.3320279121398926,
1782
+ "eval_logits/rejected": -2.290130376815796,
1783
+ "eval_logps/chosen": -415.07861328125,
1784
+ "eval_logps/rejected": -441.023681640625,
1785
+ "eval_loss": 13.35706615447998,
1786
+ "eval_positive_losses": 130.9070281982422,
1787
+ "eval_rewards/accuracies": 0.6626983880996704,
1788
+ "eval_rewards/chosen": -1.2985737323760986,
1789
+ "eval_rewards/margins": 0.5198364853858948,
1790
+ "eval_rewards/margins_max": 2.022522211074829,
1791
+ "eval_rewards/margins_min": -0.9602744579315186,
1792
+ "eval_rewards/margins_std": 1.3295913934707642,
1793
+ "eval_rewards/rejected": -1.8184101581573486,
1794
+ "eval_runtime": 281.6798,
1795
+ "eval_samples_per_second": 7.1,
1796
+ "eval_steps_per_second": 0.224,
1797
+ "step": 800
1798
+ },
1799
+ {
1800
+ "dpo_losses": 0.11107480525970459,
1801
+ "epoch": 2.28,
1802
+ "grad_norm": 1.211879634445009,
1803
+ "learning_rate": 8.24332262395994e-07,
1804
+ "logits/chosen": -2.4299230575561523,
1805
+ "logits/rejected": -2.3509979248046875,
1806
+ "logps/chosen": -259.64581298828125,
1807
+ "logps/rejected": -510.4208984375,
1808
+ "loss": 0.4972,
1809
+ "positive_losses": 1.13177490234375,
1810
+ "rewards/accuracies": 1.0,
1811
+ "rewards/chosen": 0.34921571612358093,
1812
+ "rewards/margins": 3.014543056488037,
1813
+ "rewards/margins_max": 4.297389984130859,
1814
+ "rewards/margins_min": 1.7316957712173462,
1815
+ "rewards/margins_std": 1.8142198324203491,
1816
+ "rewards/rejected": -2.665327310562134,
1817
+ "step": 810
1818
+ },
1819
+ {
1820
+ "dpo_losses": 0.15867747366428375,
1821
+ "epoch": 2.31,
1822
+ "grad_norm": 1.9974082008055192,
1823
+ "learning_rate": 7.644027904586587e-07,
1824
+ "logits/chosen": -2.368762731552124,
1825
+ "logits/rejected": -2.2775216102600098,
1826
+ "logps/chosen": -270.0508728027344,
1827
+ "logps/rejected": -446.8939514160156,
1828
+ "loss": 0.3191,
1829
+ "positive_losses": 0.852569580078125,
1830
+ "rewards/accuracies": 1.0,
1831
+ "rewards/chosen": 0.3547489047050476,
1832
+ "rewards/margins": 2.747641086578369,
1833
+ "rewards/margins_max": 3.639145612716675,
1834
+ "rewards/margins_min": 1.8561369180679321,
1835
+ "rewards/margins_std": 1.2607777118682861,
1836
+ "rewards/rejected": -2.392892360687256,
1837
+ "step": 820
1838
+ },
1839
+ {
1840
+ "dpo_losses": 0.06032683700323105,
1841
+ "epoch": 2.34,
1842
+ "grad_norm": 79.25640130890059,
1843
+ "learning_rate": 7.06339606893347e-07,
1844
+ "logits/chosen": -2.3674511909484863,
1845
+ "logits/rejected": -2.254099130630493,
1846
+ "logps/chosen": -410.2024841308594,
1847
+ "logps/rejected": -582.8692016601562,
1848
+ "loss": 0.212,
1849
+ "positive_losses": 3.602843761444092,
1850
+ "rewards/accuracies": 1.0,
1851
+ "rewards/chosen": 0.4369390904903412,
1852
+ "rewards/margins": 3.7233974933624268,
1853
+ "rewards/margins_max": 4.673032283782959,
1854
+ "rewards/margins_min": 2.7737619876861572,
1855
+ "rewards/margins_std": 1.3429871797561646,
1856
+ "rewards/rejected": -3.2864582538604736,
1857
+ "step": 830
1858
+ },
1859
+ {
1860
+ "dpo_losses": 0.12710335850715637,
1861
+ "epoch": 2.37,
1862
+ "grad_norm": 5.277843753781562,
1863
+ "learning_rate": 6.502051470645149e-07,
1864
+ "logits/chosen": -2.486243486404419,
1865
+ "logits/rejected": -2.29672908782959,
1866
+ "logps/chosen": -348.5108947753906,
1867
+ "logps/rejected": -525.3292846679688,
1868
+ "loss": 0.346,
1869
+ "positive_losses": 0.0202178955078125,
1870
+ "rewards/accuracies": 1.0,
1871
+ "rewards/chosen": 0.4295215606689453,
1872
+ "rewards/margins": 2.8306221961975098,
1873
+ "rewards/margins_max": 3.4442856311798096,
1874
+ "rewards/margins_min": 2.216958522796631,
1875
+ "rewards/margins_std": 0.8678513765335083,
1876
+ "rewards/rejected": -2.4011006355285645,
1877
+ "step": 840
1878
+ },
1879
+ {
1880
+ "dpo_losses": 0.10362281650304794,
1881
+ "epoch": 2.39,
1882
+ "grad_norm": 172.11522416865105,
1883
+ "learning_rate": 5.960597723792194e-07,
1884
+ "logits/chosen": -2.351360559463501,
1885
+ "logits/rejected": -2.166510581970215,
1886
+ "logps/chosen": -337.3572692871094,
1887
+ "logps/rejected": -561.3741455078125,
1888
+ "loss": 0.2227,
1889
+ "positive_losses": 0.215586856007576,
1890
+ "rewards/accuracies": 1.0,
1891
+ "rewards/chosen": 0.47708138823509216,
1892
+ "rewards/margins": 3.5102126598358154,
1893
+ "rewards/margins_max": 4.734680652618408,
1894
+ "rewards/margins_min": 2.28574538230896,
1895
+ "rewards/margins_std": 1.731658697128296,
1896
+ "rewards/rejected": -3.0331313610076904,
1897
+ "step": 850
1898
+ },
1899
+ {
1900
+ "dpo_losses": 0.10238895565271378,
1901
+ "epoch": 2.42,
1902
+ "grad_norm": 82.67868906193421,
1903
+ "learning_rate": 5.43961705380465e-07,
1904
+ "logits/chosen": -2.389751434326172,
1905
+ "logits/rejected": -2.2885704040527344,
1906
+ "logps/chosen": -315.6968078613281,
1907
+ "logps/rejected": -601.2724609375,
1908
+ "loss": 0.2321,
1909
+ "positive_losses": 0.0,
1910
+ "rewards/accuracies": 0.949999988079071,
1911
+ "rewards/chosen": 0.3972366154193878,
1912
+ "rewards/margins": 3.726912260055542,
1913
+ "rewards/margins_max": 4.830050945281982,
1914
+ "rewards/margins_min": 2.6237740516662598,
1915
+ "rewards/margins_std": 1.5600733757019043,
1916
+ "rewards/rejected": -3.3296761512756348,
1917
+ "step": 860
1918
+ },
1919
+ {
1920
+ "dpo_losses": 0.10260417312383652,
1921
+ "epoch": 2.45,
1922
+ "grad_norm": 24.22347862103844,
1923
+ "learning_rate": 4.939669671404871e-07,
1924
+ "logits/chosen": -2.3038783073425293,
1925
+ "logits/rejected": -2.185800552368164,
1926
+ "logps/chosen": -305.2945556640625,
1927
+ "logps/rejected": -611.5335693359375,
1928
+ "loss": 0.1672,
1929
+ "positive_losses": 0.3565734922885895,
1930
+ "rewards/accuracies": 1.0,
1931
+ "rewards/chosen": 0.35259321331977844,
1932
+ "rewards/margins": 3.309342861175537,
1933
+ "rewards/margins_max": 4.305205345153809,
1934
+ "rewards/margins_min": 2.313480854034424,
1935
+ "rewards/margins_std": 1.4083621501922607,
1936
+ "rewards/rejected": -2.956749439239502,
1937
+ "step": 870
1938
+ },
1939
+ {
1940
+ "dpo_losses": 0.0972672775387764,
1941
+ "epoch": 2.48,
1942
+ "grad_norm": 2.5325654932498978,
1943
+ "learning_rate": 4.461293170212644e-07,
1944
+ "logits/chosen": -2.366685390472412,
1945
+ "logits/rejected": -2.244584798812866,
1946
+ "logps/chosen": -284.2767028808594,
1947
+ "logps/rejected": -525.98681640625,
1948
+ "loss": 0.1957,
1949
+ "positive_losses": 4.633613586425781,
1950
+ "rewards/accuracies": 1.0,
1951
+ "rewards/chosen": 0.3293871283531189,
1952
+ "rewards/margins": 3.1354575157165527,
1953
+ "rewards/margins_max": 4.131924629211426,
1954
+ "rewards/margins_min": 2.138990640640259,
1955
+ "rewards/margins_std": 1.409217119216919,
1956
+ "rewards/rejected": -2.806070327758789,
1957
+ "step": 880
1958
+ },
1959
+ {
1960
+ "dpo_losses": 0.07743240892887115,
1961
+ "epoch": 2.51,
1962
+ "grad_norm": 246.3549744525164,
1963
+ "learning_rate": 4.005001948670606e-07,
1964
+ "logits/chosen": -2.3962912559509277,
1965
+ "logits/rejected": -2.299287796020508,
1966
+ "logps/chosen": -425.3636169433594,
1967
+ "logps/rejected": -637.3418579101562,
1968
+ "loss": 0.2586,
1969
+ "positive_losses": 1.6243082284927368,
1970
+ "rewards/accuracies": 1.0,
1971
+ "rewards/chosen": 0.5158067941665649,
1972
+ "rewards/margins": 3.6285767555236816,
1973
+ "rewards/margins_max": 4.66379451751709,
1974
+ "rewards/margins_min": 2.593358278274536,
1975
+ "rewards/margins_std": 1.464019536972046,
1976
+ "rewards/rejected": -3.112769603729248,
1977
+ "step": 890
1978
+ },
1979
+ {
1980
+ "dpo_losses": 0.0913797914981842,
1981
+ "epoch": 2.54,
1982
+ "grad_norm": 31.034651555446352,
1983
+ "learning_rate": 3.571286656911377e-07,
1984
+ "logits/chosen": -2.356581211090088,
1985
+ "logits/rejected": -2.1674458980560303,
1986
+ "logps/chosen": -353.3656005859375,
1987
+ "logps/rejected": -581.6492309570312,
1988
+ "loss": 0.2348,
1989
+ "positive_losses": 2.3227431774139404,
1990
+ "rewards/accuracies": 1.0,
1991
+ "rewards/chosen": 0.46973830461502075,
1992
+ "rewards/margins": 3.632810115814209,
1993
+ "rewards/margins_max": 5.023132801055908,
1994
+ "rewards/margins_min": 2.242488145828247,
1995
+ "rewards/margins_std": 1.966212511062622,
1996
+ "rewards/rejected": -3.163072109222412,
1997
+ "step": 900
1998
+ },
1999
+ {
2000
+ "epoch": 2.54,
2001
+ "eval_dpo_losses": 0.6903655529022217,
2002
+ "eval_logits/chosen": -2.3085381984710693,
2003
+ "eval_logits/rejected": -2.2660651206970215,
2004
+ "eval_logps/chosen": -430.1044006347656,
2005
+ "eval_logps/rejected": -459.71075439453125,
2006
+ "eval_loss": 14.72413444519043,
2007
+ "eval_positive_losses": 145.90809631347656,
2008
+ "eval_rewards/accuracies": 0.670634925365448,
2009
+ "eval_rewards/chosen": -1.4488320350646973,
2010
+ "eval_rewards/margins": 0.5564488768577576,
2011
+ "eval_rewards/margins_max": 2.180058240890503,
2012
+ "eval_rewards/margins_min": -1.0958278179168701,
2013
+ "eval_rewards/margins_std": 1.4586230516433716,
2014
+ "eval_rewards/rejected": -2.0052807331085205,
2015
+ "eval_runtime": 280.6688,
2016
+ "eval_samples_per_second": 7.126,
2017
+ "eval_steps_per_second": 0.224,
2018
+ "step": 900
2019
+ },
2020
+ {
2021
+ "dpo_losses": 0.10229980945587158,
2022
+ "epoch": 2.56,
2023
+ "grad_norm": 1.3207112449434741,
2024
+ "learning_rate": 3.1606136691612555e-07,
2025
+ "logits/chosen": -2.450596809387207,
2026
+ "logits/rejected": -2.320343255996704,
2027
+ "logps/chosen": -373.2110900878906,
2028
+ "logps/rejected": -512.347412109375,
2029
+ "loss": 0.2148,
2030
+ "positive_losses": 0.0,
2031
+ "rewards/accuracies": 1.0,
2032
+ "rewards/chosen": 0.45970940589904785,
2033
+ "rewards/margins": 3.199089527130127,
2034
+ "rewards/margins_max": 4.215831756591797,
2035
+ "rewards/margins_min": 2.182347297668457,
2036
+ "rewards/margins_std": 1.4378905296325684,
2037
+ "rewards/rejected": -2.739380121231079,
2038
+ "step": 910
2039
+ },
2040
+ {
2041
+ "dpo_losses": 0.10751441866159439,
2042
+ "epoch": 2.59,
2043
+ "grad_norm": 50.58623371052272,
2044
+ "learning_rate": 2.773424582247844e-07,
2045
+ "logits/chosen": -2.3134539127349854,
2046
+ "logits/rejected": -2.1395092010498047,
2047
+ "logps/chosen": -311.353271484375,
2048
+ "logps/rejected": -480.4183654785156,
2049
+ "loss": 0.1833,
2050
+ "positive_losses": 0.0,
2051
+ "rewards/accuracies": 1.0,
2052
+ "rewards/chosen": 0.4951690137386322,
2053
+ "rewards/margins": 3.3741965293884277,
2054
+ "rewards/margins_max": 4.4515910148620605,
2055
+ "rewards/margins_min": 2.296802043914795,
2056
+ "rewards/margins_std": 1.5236659049987793,
2057
+ "rewards/rejected": -2.8790273666381836,
2058
+ "step": 920
2059
+ },
2060
+ {
2061
+ "dpo_losses": 0.08198712766170502,
2062
+ "epoch": 2.62,
2063
+ "grad_norm": 34.59409580902137,
2064
+ "learning_rate": 2.410135740750821e-07,
2065
+ "logits/chosen": -2.3756449222564697,
2066
+ "logits/rejected": -2.2801589965820312,
2067
+ "logps/chosen": -309.6001892089844,
2068
+ "logps/rejected": -594.0984497070312,
2069
+ "loss": 0.2129,
2070
+ "positive_losses": 0.7936180233955383,
2071
+ "rewards/accuracies": 1.0,
2072
+ "rewards/chosen": 0.37005850672721863,
2073
+ "rewards/margins": 3.6145331859588623,
2074
+ "rewards/margins_max": 4.759096145629883,
2075
+ "rewards/margins_min": 2.4699695110321045,
2076
+ "rewards/margins_std": 1.618657112121582,
2077
+ "rewards/rejected": -3.244474411010742,
2078
+ "step": 930
2079
+ },
2080
+ {
2081
+ "dpo_losses": 0.18878915905952454,
2082
+ "epoch": 2.65,
2083
+ "grad_norm": 127.23596281859544,
2084
+ "learning_rate": 2.0711377893064182e-07,
2085
+ "logits/chosen": -2.362793207168579,
2086
+ "logits/rejected": -2.2321505546569824,
2087
+ "logps/chosen": -313.73736572265625,
2088
+ "logps/rejected": -428.6400451660156,
2089
+ "loss": 0.2036,
2090
+ "positive_losses": 2.974576711654663,
2091
+ "rewards/accuracies": 0.949999988079071,
2092
+ "rewards/chosen": 0.2547670602798462,
2093
+ "rewards/margins": 2.439239978790283,
2094
+ "rewards/margins_max": 3.555748701095581,
2095
+ "rewards/margins_min": 1.3227306604385376,
2096
+ "rewards/margins_std": 1.5789823532104492,
2097
+ "rewards/rejected": -2.1844725608825684,
2098
+ "step": 940
2099
+ },
2100
+ {
2101
+ "dpo_losses": 0.1424863040447235,
2102
+ "epoch": 2.68,
2103
+ "grad_norm": 3.7675671139759395,
2104
+ "learning_rate": 1.756795252547111e-07,
2105
+ "logits/chosen": -2.324215888977051,
2106
+ "logits/rejected": -2.20219087600708,
2107
+ "logps/chosen": -260.1328430175781,
2108
+ "logps/rejected": -470.79412841796875,
2109
+ "loss": 0.3619,
2110
+ "positive_losses": 0.20402908325195312,
2111
+ "rewards/accuracies": 1.0,
2112
+ "rewards/chosen": 0.42012229561805725,
2113
+ "rewards/margins": 3.099902629852295,
2114
+ "rewards/margins_max": 4.072454452514648,
2115
+ "rewards/margins_min": 2.1273510456085205,
2116
+ "rewards/margins_std": 1.3753960132598877,
2117
+ "rewards/rejected": -2.6797804832458496,
2118
+ "step": 950
2119
+ },
2120
+ {
2121
+ "dpo_losses": 0.10174594074487686,
2122
+ "epoch": 2.7,
2123
+ "grad_norm": 1.8085505107547237,
2124
+ "learning_rate": 1.4674461431281013e-07,
2125
+ "logits/chosen": -2.525442361831665,
2126
+ "logits/rejected": -2.4085853099823,
2127
+ "logps/chosen": -290.04345703125,
2128
+ "logps/rejected": -531.1943969726562,
2129
+ "loss": 0.3038,
2130
+ "positive_losses": 0.1788475066423416,
2131
+ "rewards/accuracies": 1.0,
2132
+ "rewards/chosen": 0.3685051202774048,
2133
+ "rewards/margins": 3.2411983013153076,
2134
+ "rewards/margins_max": 4.235571384429932,
2135
+ "rewards/margins_min": 2.246825695037842,
2136
+ "rewards/margins_std": 1.4062554836273193,
2137
+ "rewards/rejected": -2.8726933002471924,
2138
+ "step": 960
2139
+ },
2140
+ {
2141
+ "dpo_losses": 0.09436773508787155,
2142
+ "epoch": 2.73,
2143
+ "grad_norm": 38.21579384775439,
2144
+ "learning_rate": 1.2034015982622243e-07,
2145
+ "logits/chosen": -2.403714418411255,
2146
+ "logits/rejected": -2.282444477081299,
2147
+ "logps/chosen": -336.4191589355469,
2148
+ "logps/rejected": -650.2848510742188,
2149
+ "loss": 0.2727,
2150
+ "positive_losses": 4.7762908935546875,
2151
+ "rewards/accuracies": 1.0,
2152
+ "rewards/chosen": 0.38717547059059143,
2153
+ "rewards/margins": 3.8552498817443848,
2154
+ "rewards/margins_max": 5.095088481903076,
2155
+ "rewards/margins_min": 2.615410327911377,
2156
+ "rewards/margins_std": 1.7533977031707764,
2157
+ "rewards/rejected": -3.468074083328247,
2158
+ "step": 970
2159
+ },
2160
+ {
2161
+ "dpo_losses": 0.0995684489607811,
2162
+ "epoch": 2.76,
2163
+ "grad_norm": 34.214640164153415,
2164
+ "learning_rate": 9.649455451539419e-08,
2165
+ "logits/chosen": -2.195786714553833,
2166
+ "logits/rejected": -2.1557459831237793,
2167
+ "logps/chosen": -205.28829956054688,
2168
+ "logps/rejected": -449.69903564453125,
2169
+ "loss": 0.362,
2170
+ "positive_losses": 1.5731815099716187,
2171
+ "rewards/accuracies": 0.949999988079071,
2172
+ "rewards/chosen": 0.3235073983669281,
2173
+ "rewards/margins": 3.076923131942749,
2174
+ "rewards/margins_max": 3.9705424308776855,
2175
+ "rewards/margins_min": 2.1833040714263916,
2176
+ "rewards/margins_std": 1.2637684345245361,
2177
+ "rewards/rejected": -2.753415584564209,
2178
+ "step": 980
2179
+ },
2180
+ {
2181
+ "dpo_losses": 0.09841219335794449,
2182
+ "epoch": 2.79,
2183
+ "grad_norm": 3.551041227418815,
2184
+ "learning_rate": 7.523343956923196e-08,
2185
+ "logits/chosen": -2.4305670261383057,
2186
+ "logits/rejected": -2.3597071170806885,
2187
+ "logps/chosen": -302.9432678222656,
2188
+ "logps/rejected": -600.987060546875,
2189
+ "loss": 0.2684,
2190
+ "positive_losses": 1.013157606124878,
2191
+ "rewards/accuracies": 1.0,
2192
+ "rewards/chosen": 0.49134930968284607,
2193
+ "rewards/margins": 3.6477818489074707,
2194
+ "rewards/margins_max": 4.790149688720703,
2195
+ "rewards/margins_min": 2.5054140090942383,
2196
+ "rewards/margins_std": 1.6155517101287842,
2197
+ "rewards/rejected": -3.156432628631592,
2198
+ "step": 990
2199
+ },
2200
+ {
2201
+ "dpo_losses": 0.11016283929347992,
2202
+ "epoch": 2.82,
2203
+ "grad_norm": 17.885804652199482,
2204
+ "learning_rate": 5.657967707312195e-08,
2205
+ "logits/chosen": -2.2564501762390137,
2206
+ "logits/rejected": -2.223057985305786,
2207
+ "logps/chosen": -243.24911499023438,
2208
+ "logps/rejected": -581.9591064453125,
2209
+ "loss": 0.1369,
2210
+ "positive_losses": 0.5390418171882629,
2211
+ "rewards/accuracies": 1.0,
2212
+ "rewards/chosen": 0.3867679238319397,
2213
+ "rewards/margins": 3.3442111015319824,
2214
+ "rewards/margins_max": 4.438827991485596,
2215
+ "rewards/margins_min": 2.2495944499969482,
2216
+ "rewards/margins_std": 1.5480217933654785,
2217
+ "rewards/rejected": -2.9574437141418457,
2218
+ "step": 1000
2219
+ },
2220
+ {
2221
+ "epoch": 2.82,
2222
+ "eval_dpo_losses": 0.6868842244148254,
2223
+ "eval_logits/chosen": -2.3165292739868164,
2224
+ "eval_logits/rejected": -2.2738356590270996,
2225
+ "eval_logps/chosen": -428.1341857910156,
2226
+ "eval_logps/rejected": -461.68865966796875,
2227
+ "eval_loss": 14.59554386138916,
2228
+ "eval_positive_losses": 143.93893432617188,
2229
+ "eval_rewards/accuracies": 0.6626983880996704,
2230
+ "eval_rewards/chosen": -1.4291293621063232,
2231
+ "eval_rewards/margins": 0.5959304571151733,
2232
+ "eval_rewards/margins_max": 2.2952685356140137,
2233
+ "eval_rewards/margins_min": -1.107314109802246,
2234
+ "eval_rewards/margins_std": 1.5052008628845215,
2235
+ "eval_rewards/rejected": -2.025059700012207,
2236
+ "eval_runtime": 280.8142,
2237
+ "eval_samples_per_second": 7.122,
2238
+ "eval_steps_per_second": 0.224,
2239
+ "step": 1000
2240
+ },
2241
+ {
2242
+ "dpo_losses": 0.13428188860416412,
2243
+ "epoch": 2.85,
2244
+ "grad_norm": 223.7670035438648,
2245
+ "learning_rate": 4.055332542531959e-08,
2246
+ "logits/chosen": -2.418339490890503,
2247
+ "logits/rejected": -2.3237392902374268,
2248
+ "logps/chosen": -274.25958251953125,
2249
+ "logps/rejected": -552.5701904296875,
2250
+ "loss": 0.4402,
2251
+ "positive_losses": 0.35679930448532104,
2252
+ "rewards/accuracies": 1.0,
2253
+ "rewards/chosen": 0.41407138109207153,
2254
+ "rewards/margins": 3.3448989391326904,
2255
+ "rewards/margins_max": 4.5948896408081055,
2256
+ "rewards/margins_min": 2.094907522201538,
2257
+ "rewards/margins_std": 1.7677549123764038,
2258
+ "rewards/rejected": -2.9308273792266846,
2259
+ "step": 1010
2260
+ },
2261
+ {
2262
+ "dpo_losses": 0.09611718356609344,
2263
+ "epoch": 2.87,
2264
+ "grad_norm": 4.2164881541545896,
2265
+ "learning_rate": 2.7171617768147472e-08,
2266
+ "logits/chosen": -2.3242077827453613,
2267
+ "logits/rejected": -2.20931339263916,
2268
+ "logps/chosen": -273.04669189453125,
2269
+ "logps/rejected": -532.8912963867188,
2270
+ "loss": 0.2835,
2271
+ "positive_losses": 1.0188411474227905,
2272
+ "rewards/accuracies": 1.0,
2273
+ "rewards/chosen": 0.38434693217277527,
2274
+ "rewards/margins": 3.382856845855713,
2275
+ "rewards/margins_max": 4.1625471115112305,
2276
+ "rewards/margins_min": 2.6031665802001953,
2277
+ "rewards/margins_std": 1.1026487350463867,
2278
+ "rewards/rejected": -2.9985098838806152,
2279
+ "step": 1020
2280
+ },
2281
+ {
2282
+ "dpo_losses": 0.09702328592538834,
2283
+ "epoch": 2.9,
2284
+ "grad_norm": 37.444607526485036,
2285
+ "learning_rate": 1.6448943457189616e-08,
2286
+ "logits/chosen": -2.3770041465759277,
2287
+ "logits/rejected": -2.312561511993408,
2288
+ "logps/chosen": -323.7328186035156,
2289
+ "logps/rejected": -590.1715698242188,
2290
+ "loss": 0.1692,
2291
+ "positive_losses": 0.22979411482810974,
2292
+ "rewards/accuracies": 1.0,
2293
+ "rewards/chosen": 0.4143516421318054,
2294
+ "rewards/margins": 3.550870418548584,
2295
+ "rewards/margins_max": 4.831820487976074,
2296
+ "rewards/margins_min": 2.269920825958252,
2297
+ "rewards/margins_std": 1.8115367889404297,
2298
+ "rewards/rejected": -3.136518955230713,
2299
+ "step": 1030
2300
+ },
2301
+ {
2302
+ "dpo_losses": 0.05908944085240364,
2303
+ "epoch": 2.93,
2304
+ "grad_norm": 4.414870466706013,
2305
+ "learning_rate": 8.39683258841123e-09,
2306
+ "logits/chosen": -2.2308764457702637,
2307
+ "logits/rejected": -2.1003758907318115,
2308
+ "logps/chosen": -295.9768371582031,
2309
+ "logps/rejected": -529.7901611328125,
2310
+ "loss": 0.2377,
2311
+ "positive_losses": 3.635768175125122,
2312
+ "rewards/accuracies": 1.0,
2313
+ "rewards/chosen": 0.5024327039718628,
2314
+ "rewards/margins": 3.5135796070098877,
2315
+ "rewards/margins_max": 4.2515668869018555,
2316
+ "rewards/margins_min": 2.7755913734436035,
2317
+ "rewards/margins_std": 1.0436723232269287,
2318
+ "rewards/rejected": -3.0111465454101562,
2319
+ "step": 1040
2320
+ },
2321
+ {
2322
+ "dpo_losses": 0.13825824856758118,
2323
+ "epoch": 2.96,
2324
+ "grad_norm": 2.4504883804901993,
2325
+ "learning_rate": 3.0239435998430376e-09,
2326
+ "logits/chosen": -2.348167896270752,
2327
+ "logits/rejected": -2.233564615249634,
2328
+ "logps/chosen": -286.636962890625,
2329
+ "logps/rejected": -520.432861328125,
2330
+ "loss": 0.2494,
2331
+ "positive_losses": 3.874138593673706,
2332
+ "rewards/accuracies": 1.0,
2333
+ "rewards/chosen": 0.4494766294956207,
2334
+ "rewards/margins": 3.075880527496338,
2335
+ "rewards/margins_max": 4.244940280914307,
2336
+ "rewards/margins_min": 1.9068210124969482,
2337
+ "rewards/margins_std": 1.6533000469207764,
2338
+ "rewards/rejected": -2.62640380859375,
2339
+ "step": 1050
2340
+ },
2341
+ {
2342
+ "dpo_losses": 0.10627589374780655,
2343
+ "epoch": 2.99,
2344
+ "grad_norm": 2.8525382266880834,
2345
+ "learning_rate": 3.3605396115826695e-10,
2346
+ "logits/chosen": -2.2836787700653076,
2347
+ "logits/rejected": -2.183605432510376,
2348
+ "logps/chosen": -289.5533447265625,
2349
+ "logps/rejected": -472.84375,
2350
+ "loss": 0.3088,
2351
+ "positive_losses": 0.0,
2352
+ "rewards/accuracies": 1.0,
2353
+ "rewards/chosen": 0.3630369305610657,
2354
+ "rewards/margins": 2.99712872505188,
2355
+ "rewards/margins_max": 3.9373364448547363,
2356
+ "rewards/margins_min": 2.0569205284118652,
2357
+ "rewards/margins_std": 1.3296549320220947,
2358
+ "rewards/rejected": -2.63409161567688,
2359
+ "step": 1060
2360
+ },
2361
+ {
2362
+ "epoch": 3.0,
2363
+ "step": 1065,
2364
+ "total_flos": 0.0,
2365
+ "train_loss": 0.36563416943303856,
2366
+ "train_runtime": 9271.2095,
2367
+ "train_samples_per_second": 1.837,
2368
+ "train_steps_per_second": 0.115
2369
+ }
2370
+ ],
2371
+ "logging_steps": 10,
2372
+ "max_steps": 1065,
2373
+ "num_input_tokens_seen": 0,
2374
+ "num_train_epochs": 3,
2375
+ "save_steps": 100,
2376
+ "total_flos": 0.0,
2377
+ "train_batch_size": 2,
2378
+ "trial_name": null,
2379
+ "trial_params": null
2380
+ }