hZzy committed on
Commit
967b64e
1 Parent(s): 9e376f5

Model save

Browse files
README.md ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: hZzy/qwen2.5-0.5b-sft-news-IFT
4
+ tags:
5
+ - trl
6
+ - expo
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: qwen2.5-0.5b-expo-EXDPO-WEIGHT-BETA0.2
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/zhiyuzha-university-of-florida/huggingface/runs/3vfs9qoc)
17
+ # qwen2.5-0.5b-expo-EXDPO-WEIGHT-BETA0.2
18
+
19
+ This model is a fine-tuned version of [hZzy/qwen2.5-0.5b-sft-news-IFT](https://huggingface.co/hZzy/qwen2.5-0.5b-sft-news-IFT) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.3418
22
+ - Logps: -93.8220
23
+ - Logits: -1.1885
24
+ - Objective: 0.3404
25
+ - Dpo Loss: 0.7083
26
+ - Regularize: 0.2696
27
+ - Ranking Simple: 0.5197
28
+ - Ranking Idealized: 0.5399
29
+ - Ranking Idealized Expo: 0.5243
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-07
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 4
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - num_devices: 3
54
+ - gradient_accumulation_steps: 8
55
+ - total_train_batch_size: 96
56
+ - total_eval_batch_size: 12
57
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
+ - lr_scheduler_type: cosine
59
+ - lr_scheduler_warmup_ratio: 0.1
60
+ - num_epochs: 10
61
+ - mixed_precision_training: Native AMP
62
+
63
+ ### Training results
64
+
65
+ | Training Loss | Epoch | Step | Validation Loss | Logps | Logits | Objective | Dpo Loss | Regularize | Ranking Simple | Ranking Idealized | Ranking Idealized Expo |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------:|:-------:|:---------:|:--------:|:----------:|:--------------:|:-----------------:|:----------------------:|
67
+ | 0.0798 | 0.0945 | 50 | 0.0807 | -98.5040 | -1.3072 | 0.0808 | 0.6932 | 0.0115 | 0.5238 | 0.5399 | 0.5243 |
68
+ | 0.081 | 0.1889 | 100 | 0.0819 | -98.4932 | -1.3084 | 0.0819 | 0.6934 | 0.0125 | 0.5238 | 0.5399 | 0.5243 |
69
+ | 0.0839 | 0.2834 | 150 | 0.0823 | -98.5417 | -1.3079 | 0.0823 | 0.6931 | 0.0130 | 0.5233 | 0.5399 | 0.5243 |
70
+ | 0.0891 | 0.3779 | 200 | 0.0840 | -98.6517 | -1.3063 | 0.0839 | 0.6927 | 0.0147 | 0.5238 | 0.5399 | 0.5243 |
71
+ | 0.1019 | 0.4724 | 250 | 0.0865 | -98.6753 | -1.3058 | 0.0864 | 0.6929 | 0.0171 | 0.5233 | 0.5399 | 0.5243 |
72
+ | 0.1094 | 0.5668 | 300 | 0.0928 | -98.3448 | -1.3087 | 0.0930 | 0.6926 | 0.0238 | 0.5238 | 0.5399 | 0.5243 |
73
+ | 0.1267 | 0.6613 | 350 | 0.0995 | -98.4803 | -1.3097 | 0.1004 | 0.6942 | 0.0310 | 0.5243 | 0.5399 | 0.5243 |
74
+ | 0.1414 | 0.7558 | 400 | 0.1020 | -98.6999 | -1.3138 | 0.1027 | 0.6920 | 0.0335 | 0.5248 | 0.5399 | 0.5243 |
75
+ | 0.156 | 0.8503 | 450 | 0.1102 | -98.6961 | -1.3001 | 0.1107 | 0.6917 | 0.0415 | 0.5228 | 0.5399 | 0.5243 |
76
+ | 0.1843 | 0.9447 | 500 | 0.1425 | -98.3685 | -1.2985 | 0.1416 | 0.6934 | 0.0723 | 0.5217 | 0.5399 | 0.5243 |
77
+ | 0.1954 | 1.0392 | 550 | 0.1388 | -98.2336 | -1.3154 | 0.1383 | 0.6946 | 0.0689 | 0.5228 | 0.5399 | 0.5243 |
78
+ | 0.2073 | 1.1337 | 600 | 0.1374 | -98.7215 | -1.3071 | 0.1369 | 0.6936 | 0.0676 | 0.5228 | 0.5399 | 0.5243 |
79
+ | 0.2165 | 1.2282 | 650 | 0.1478 | -97.9261 | -1.2926 | 0.1487 | 0.6916 | 0.0796 | 0.5233 | 0.5399 | 0.5243 |
80
+ | 0.2333 | 1.3226 | 700 | 0.1470 | -97.1071 | -1.2930 | 0.1450 | 0.6915 | 0.0758 | 0.5243 | 0.5399 | 0.5243 |
81
+ | 0.229 | 1.4171 | 750 | 0.1718 | -97.0923 | -1.2689 | 0.1725 | 0.6929 | 0.1032 | 0.5238 | 0.5399 | 0.5243 |
82
+ | 0.2565 | 1.5116 | 800 | 0.1817 | -97.2621 | -1.2540 | 0.1830 | 0.6944 | 0.1136 | 0.5243 | 0.5399 | 0.5243 |
83
+ | 0.2479 | 1.6060 | 850 | 0.1864 | -96.3423 | -1.2708 | 0.1853 | 0.6946 | 0.1159 | 0.5243 | 0.5399 | 0.5243 |
84
+ | 0.2586 | 1.7005 | 900 | 0.1839 | -97.2157 | -1.2623 | 0.1825 | 0.6944 | 0.1131 | 0.5223 | 0.5399 | 0.5243 |
85
+ | 0.2347 | 1.7950 | 950 | 0.1995 | -94.8402 | -1.2678 | 0.1989 | 0.6945 | 0.1295 | 0.5238 | 0.5399 | 0.5243 |
86
+ | 0.2414 | 1.8895 | 1000 | 0.1895 | -95.8793 | -1.2579 | 0.1901 | 0.6924 | 0.1209 | 0.5254 | 0.5399 | 0.5243 |
87
+ | 0.2433 | 1.9839 | 1050 | 0.2097 | -95.7970 | -1.2552 | 0.2068 | 0.6923 | 0.1376 | 0.5259 | 0.5399 | 0.5243 |
88
+ | 0.2393 | 2.0784 | 1100 | 0.2156 | -96.9313 | -1.2422 | 0.2149 | 0.6962 | 0.1452 | 0.5264 | 0.5399 | 0.5243 |
89
+ | 0.2476 | 2.1729 | 1150 | 0.2195 | -95.8618 | -1.2485 | 0.2191 | 0.6958 | 0.1495 | 0.5238 | 0.5399 | 0.5243 |
90
+ | 0.2443 | 2.2674 | 1200 | 0.2318 | -97.1362 | -1.2241 | 0.2317 | 0.6998 | 0.1617 | 0.5259 | 0.5399 | 0.5243 |
91
+ | 0.2337 | 2.3618 | 1250 | 0.2494 | -96.2629 | -1.2313 | 0.2515 | 0.6950 | 0.1820 | 0.5269 | 0.5399 | 0.5243 |
92
+ | 0.2264 | 2.4563 | 1300 | 0.2473 | -94.4504 | -1.2535 | 0.2456 | 0.6981 | 0.1758 | 0.5223 | 0.5399 | 0.5243 |
93
+ | 0.2398 | 2.5508 | 1350 | 0.2467 | -96.2065 | -1.2349 | 0.2462 | 0.7027 | 0.1760 | 0.5197 | 0.5399 | 0.5243 |
94
+ | 0.2346 | 2.6453 | 1400 | 0.2565 | -94.6591 | -1.2562 | 0.2567 | 0.7002 | 0.1867 | 0.5212 | 0.5399 | 0.5243 |
95
+ | 0.242 | 2.7397 | 1450 | 0.2640 | -94.6555 | -1.2141 | 0.2641 | 0.7015 | 0.1939 | 0.5243 | 0.5399 | 0.5243 |
96
+ | 0.2372 | 2.8342 | 1500 | 0.2747 | -94.9289 | -1.2472 | 0.2726 | 0.7027 | 0.2024 | 0.5202 | 0.5399 | 0.5243 |
97
+ | 0.2133 | 2.9287 | 1550 | 0.2529 | -95.1991 | -1.2345 | 0.2512 | 0.7006 | 0.1811 | 0.5243 | 0.5399 | 0.5243 |
98
+ | 0.2292 | 3.0231 | 1600 | 0.2840 | -93.6334 | -1.2437 | 0.2861 | 0.7038 | 0.2157 | 0.5197 | 0.5399 | 0.5243 |
99
+ | 0.2227 | 3.1176 | 1650 | 0.2854 | -93.4763 | -1.2332 | 0.2851 | 0.7025 | 0.2149 | 0.5217 | 0.5399 | 0.5243 |
100
+ | 0.2123 | 3.2121 | 1700 | 0.2752 | -95.6906 | -1.2311 | 0.2756 | 0.7008 | 0.2055 | 0.5233 | 0.5399 | 0.5243 |
101
+ | 0.218 | 3.3066 | 1750 | 0.2800 | -95.9042 | -1.2167 | 0.2783 | 0.7037 | 0.2079 | 0.5238 | 0.5399 | 0.5243 |
102
+ | 0.2086 | 3.4010 | 1800 | 0.2945 | -95.6983 | -1.2183 | 0.2932 | 0.7027 | 0.2230 | 0.5233 | 0.5399 | 0.5243 |
103
+ | 0.216 | 3.4955 | 1850 | 0.2895 | -93.0784 | -1.2235 | 0.2873 | 0.7028 | 0.2171 | 0.5212 | 0.5399 | 0.5243 |
104
+ | 0.2182 | 3.5900 | 1900 | 0.2973 | -95.2384 | -1.2138 | 0.2977 | 0.7019 | 0.2275 | 0.5207 | 0.5399 | 0.5243 |
105
+ | 0.2097 | 3.6845 | 1950 | 0.3023 | -93.4940 | -1.2111 | 0.3000 | 0.7046 | 0.2295 | 0.5217 | 0.5399 | 0.5243 |
106
+ | 0.2076 | 3.7789 | 2000 | 0.3084 | -93.0939 | -1.2337 | 0.3067 | 0.7034 | 0.2364 | 0.5243 | 0.5399 | 0.5243 |
107
+ | 0.2099 | 3.8734 | 2050 | 0.2962 | -93.1727 | -1.2280 | 0.2954 | 0.7044 | 0.2249 | 0.5212 | 0.5399 | 0.5243 |
108
+ | 0.2001 | 3.9679 | 2100 | 0.3139 | -93.9210 | -1.2079 | 0.3123 | 0.7063 | 0.2417 | 0.5186 | 0.5399 | 0.5243 |
109
+ | 0.2082 | 4.0624 | 2150 | 0.3119 | -93.6768 | -1.2148 | 0.3124 | 0.7037 | 0.2420 | 0.5217 | 0.5399 | 0.5243 |
110
+ | 0.1914 | 4.1568 | 2200 | 0.3139 | -94.5737 | -1.2179 | 0.3138 | 0.7032 | 0.2434 | 0.5197 | 0.5399 | 0.5243 |
111
+ | 0.2026 | 4.2513 | 2250 | 0.3179 | -93.2220 | -1.2044 | 0.3177 | 0.7035 | 0.2473 | 0.5202 | 0.5399 | 0.5243 |
112
+ | 0.1908 | 4.3458 | 2300 | 0.3067 | -94.3151 | -1.2117 | 0.3085 | 0.7022 | 0.2383 | 0.5233 | 0.5399 | 0.5243 |
113
+ | 0.1931 | 4.4402 | 2350 | 0.3241 | -93.4124 | -1.2066 | 0.3236 | 0.7058 | 0.2530 | 0.5223 | 0.5399 | 0.5243 |
114
+ | 0.195 | 4.5347 | 2400 | 0.3111 | -94.2419 | -1.2062 | 0.3113 | 0.7035 | 0.2410 | 0.5217 | 0.5399 | 0.5243 |
115
+ | 0.1947 | 4.6292 | 2450 | 0.3312 | -93.6715 | -1.1956 | 0.3317 | 0.7067 | 0.2610 | 0.5228 | 0.5399 | 0.5243 |
116
+ | 0.1837 | 4.7237 | 2500 | 0.3289 | -93.6179 | -1.2041 | 0.3304 | 0.7077 | 0.2596 | 0.5223 | 0.5399 | 0.5243 |
117
+ | 0.1751 | 4.8181 | 2550 | 0.3254 | -93.4709 | -1.1993 | 0.3247 | 0.7060 | 0.2541 | 0.5212 | 0.5399 | 0.5243 |
118
+ | 0.1717 | 4.9126 | 2600 | 0.3287 | -94.2886 | -1.2078 | 0.3292 | 0.7050 | 0.2587 | 0.5207 | 0.5399 | 0.5243 |
119
+ | 0.1761 | 5.0071 | 2650 | 0.3257 | -93.6210 | -1.2055 | 0.3239 | 0.7061 | 0.2533 | 0.5217 | 0.5399 | 0.5243 |
120
+ | 0.1692 | 5.1016 | 2700 | 0.3396 | -93.0109 | -1.2063 | 0.3378 | 0.7072 | 0.2670 | 0.5223 | 0.5399 | 0.5243 |
121
+ | 0.1676 | 5.1960 | 2750 | 0.3402 | -93.9591 | -1.1978 | 0.3384 | 0.7084 | 0.2675 | 0.5202 | 0.5399 | 0.5243 |
122
+ | 0.1743 | 5.2905 | 2800 | 0.3371 | -93.9100 | -1.1972 | 0.3351 | 0.7076 | 0.2643 | 0.5217 | 0.5399 | 0.5243 |
123
+ | 0.1715 | 5.3850 | 2850 | 0.3408 | -93.6808 | -1.1939 | 0.3405 | 0.7084 | 0.2696 | 0.5212 | 0.5399 | 0.5243 |
124
+ | 0.1643 | 5.4795 | 2900 | 0.3434 | -93.0381 | -1.1941 | 0.3434 | 0.7095 | 0.2724 | 0.5192 | 0.5399 | 0.5243 |
125
+ | 0.1569 | 5.5739 | 2950 | 0.3403 | -94.4489 | -1.1993 | 0.3406 | 0.7083 | 0.2698 | 0.5192 | 0.5399 | 0.5243 |
126
+ | 0.16 | 5.6684 | 3000 | 0.3337 | -94.1339 | -1.1952 | 0.3332 | 0.7068 | 0.2625 | 0.5233 | 0.5399 | 0.5243 |
127
+ | 0.1556 | 5.7629 | 3050 | 0.3379 | -93.7011 | -1.1943 | 0.3366 | 0.7075 | 0.2658 | 0.5197 | 0.5399 | 0.5243 |
128
+ | 0.1544 | 5.8573 | 3100 | 0.3407 | -93.8059 | -1.1896 | 0.3385 | 0.7082 | 0.2677 | 0.5212 | 0.5399 | 0.5243 |
129
+ | 0.1539 | 5.9518 | 3150 | 0.3377 | -93.3647 | -1.2013 | 0.3358 | 0.7079 | 0.2650 | 0.5207 | 0.5399 | 0.5243 |
130
+ | 0.1448 | 6.0463 | 3200 | 0.3418 | -93.0674 | -1.1912 | 0.3402 | 0.7086 | 0.2693 | 0.5181 | 0.5399 | 0.5243 |
131
+ | 0.1479 | 6.1408 | 3250 | 0.3437 | -93.1651 | -1.1883 | 0.3423 | 0.7079 | 0.2715 | 0.5217 | 0.5399 | 0.5243 |
132
+ | 0.1408 | 6.2352 | 3300 | 0.3427 | -93.4029 | -1.1821 | 0.3405 | 0.7074 | 0.2698 | 0.5197 | 0.5399 | 0.5243 |
133
+ | 0.1475 | 6.3297 | 3350 | 0.3401 | -93.6032 | -1.1856 | 0.3383 | 0.7078 | 0.2675 | 0.5192 | 0.5399 | 0.5243 |
134
+ | 0.1339 | 6.4242 | 3400 | 0.3415 | -93.5229 | -1.1891 | 0.3402 | 0.7082 | 0.2693 | 0.5212 | 0.5399 | 0.5243 |
135
+ | 0.1394 | 6.5187 | 3450 | 0.3398 | -94.0518 | -1.1959 | 0.3379 | 0.7083 | 0.2671 | 0.5186 | 0.5399 | 0.5243 |
136
+ | 0.1324 | 6.6131 | 3500 | 0.3401 | -93.9466 | -1.1836 | 0.3389 | 0.7075 | 0.2682 | 0.5192 | 0.5399 | 0.5243 |
137
+ | 0.1385 | 6.7076 | 3550 | 0.3449 | -93.6245 | -1.1866 | 0.3437 | 0.7080 | 0.2729 | 0.5202 | 0.5399 | 0.5243 |
138
+ | 0.1289 | 6.8021 | 3600 | 0.3433 | -93.8482 | -1.1858 | 0.3412 | 0.7088 | 0.2703 | 0.5192 | 0.5399 | 0.5243 |
139
+ | 0.1272 | 6.8966 | 3650 | 0.3431 | -93.9371 | -1.1979 | 0.3417 | 0.7080 | 0.2709 | 0.5202 | 0.5399 | 0.5243 |
140
+ | 0.125 | 6.9910 | 3700 | 0.3436 | -93.9666 | -1.1952 | 0.3425 | 0.7079 | 0.2717 | 0.5202 | 0.5399 | 0.5243 |
141
+ | 0.1227 | 7.0855 | 3750 | 0.3404 | -93.8781 | -1.2022 | 0.3382 | 0.7086 | 0.2674 | 0.5197 | 0.5399 | 0.5243 |
142
+ | 0.1142 | 7.1800 | 3800 | 0.3426 | -93.8234 | -1.1874 | 0.3420 | 0.7083 | 0.2712 | 0.5207 | 0.5399 | 0.5243 |
143
+ | 0.1142 | 7.2744 | 3850 | 0.3454 | -93.6895 | -1.1775 | 0.3442 | 0.7090 | 0.2733 | 0.5202 | 0.5399 | 0.5243 |
144
+ | 0.1128 | 7.3689 | 3900 | 0.3417 | -94.0521 | -1.1838 | 0.3406 | 0.7083 | 0.2698 | 0.5197 | 0.5399 | 0.5243 |
145
+ | 0.1158 | 7.4634 | 3950 | 0.3434 | -93.9208 | -1.1875 | 0.3423 | 0.7086 | 0.2714 | 0.5197 | 0.5399 | 0.5243 |
146
+ | 0.113 | 7.5579 | 4000 | 0.3428 | -93.6866 | -1.1850 | 0.3411 | 0.7087 | 0.2702 | 0.5197 | 0.5399 | 0.5243 |
147
+ | 0.1113 | 7.6523 | 4050 | 0.3434 | -93.6171 | -1.1837 | 0.3425 | 0.7087 | 0.2716 | 0.5202 | 0.5399 | 0.5243 |
148
+ | 0.1082 | 7.7468 | 4100 | 0.3411 | -94.0013 | -1.1852 | 0.3403 | 0.7081 | 0.2695 | 0.5192 | 0.5399 | 0.5243 |
149
+ | 0.1051 | 7.8413 | 4150 | 0.3425 | -93.8552 | -1.1848 | 0.3417 | 0.7083 | 0.2709 | 0.5197 | 0.5399 | 0.5243 |
150
+ | 0.1047 | 7.9358 | 4200 | 0.3422 | -93.6696 | -1.1872 | 0.3411 | 0.7085 | 0.2702 | 0.5197 | 0.5399 | 0.5243 |
151
+ | 0.0985 | 8.0302 | 4250 | 0.3416 | -93.6924 | -1.1844 | 0.3403 | 0.7083 | 0.2695 | 0.5197 | 0.5399 | 0.5243 |
152
+ | 0.0964 | 8.1247 | 4300 | 0.3422 | -93.5025 | -1.1871 | 0.3409 | 0.7082 | 0.2701 | 0.5202 | 0.5399 | 0.5243 |
153
+ | 0.0997 | 8.2192 | 4350 | 0.3423 | -93.8074 | -1.1866 | 0.3408 | 0.7081 | 0.2700 | 0.5186 | 0.5399 | 0.5243 |
154
+ | 0.0963 | 8.3137 | 4400 | 0.3434 | -93.6885 | -1.1861 | 0.3419 | 0.7084 | 0.2711 | 0.5202 | 0.5399 | 0.5243 |
155
+ | 0.0966 | 8.4081 | 4450 | 0.3434 | -93.7312 | -1.1875 | 0.3419 | 0.7084 | 0.2711 | 0.5186 | 0.5399 | 0.5243 |
156
+ | 0.0956 | 8.5026 | 4500 | 0.3431 | -93.8431 | -1.1866 | 0.3416 | 0.7081 | 0.2708 | 0.5186 | 0.5399 | 0.5243 |
157
+ | 0.0928 | 8.5971 | 4550 | 0.3428 | -93.8243 | -1.1859 | 0.3414 | 0.7084 | 0.2706 | 0.5186 | 0.5399 | 0.5243 |
158
+ | 0.0924 | 8.6915 | 4600 | 0.3418 | -93.7706 | -1.1871 | 0.3406 | 0.7082 | 0.2698 | 0.5186 | 0.5399 | 0.5243 |
159
+ | 0.0908 | 8.7860 | 4650 | 0.3415 | -93.7405 | -1.1872 | 0.3403 | 0.7079 | 0.2695 | 0.5202 | 0.5399 | 0.5243 |
160
+ | 0.0922 | 8.8805 | 4700 | 0.3419 | -93.7126 | -1.1888 | 0.3405 | 0.7078 | 0.2698 | 0.5202 | 0.5399 | 0.5243 |
161
+ | 0.0895 | 8.9750 | 4750 | 0.3417 | -93.7926 | -1.1886 | 0.3402 | 0.7080 | 0.2694 | 0.5202 | 0.5399 | 0.5243 |
162
+ | 0.0877 | 9.0694 | 4800 | 0.3425 | -93.7523 | -1.1891 | 0.3415 | 0.7083 | 0.2706 | 0.5197 | 0.5399 | 0.5243 |
163
+ | 0.0862 | 9.1639 | 4850 | 0.3423 | -93.8492 | -1.1894 | 0.3406 | 0.7082 | 0.2698 | 0.5207 | 0.5399 | 0.5243 |
164
+ | 0.0856 | 9.2584 | 4900 | 0.3417 | -93.8453 | -1.1883 | 0.3404 | 0.7081 | 0.2696 | 0.5197 | 0.5399 | 0.5243 |
165
+ | 0.0883 | 9.3529 | 4950 | 0.3414 | -93.8773 | -1.1886 | 0.3401 | 0.7080 | 0.2693 | 0.5202 | 0.5399 | 0.5243 |
166
+ | 0.0866 | 9.4473 | 5000 | 0.3414 | -93.8593 | -1.1880 | 0.3402 | 0.7081 | 0.2694 | 0.5197 | 0.5399 | 0.5243 |
167
+ | 0.0843 | 9.5418 | 5050 | 0.3417 | -93.8241 | -1.1880 | 0.3405 | 0.7081 | 0.2697 | 0.5207 | 0.5399 | 0.5243 |
168
+ | 0.0862 | 9.6363 | 5100 | 0.3419 | -93.8268 | -1.1884 | 0.3404 | 0.7081 | 0.2696 | 0.5197 | 0.5399 | 0.5243 |
169
+ | 0.0851 | 9.7308 | 5150 | 0.3418 | -93.8247 | -1.1881 | 0.3405 | 0.7082 | 0.2697 | 0.5192 | 0.5399 | 0.5243 |
170
+ | 0.0852 | 9.8252 | 5200 | 0.3415 | -93.8257 | -1.1886 | 0.3402 | 0.7081 | 0.2694 | 0.5197 | 0.5399 | 0.5243 |
171
+ | 0.0873 | 9.9197 | 5250 | 0.3418 | -93.8220 | -1.1885 | 0.3404 | 0.7083 | 0.2696 | 0.5197 | 0.5399 | 0.5243 |
172
+
173
+
174
+ ### Framework versions
175
+
176
+ - Transformers 4.42.0
177
+ - Pytorch 2.3.0+cu121
178
+ - Datasets 2.19.1
179
+ - Tokenizers 0.19.1
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.995276334435522,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.15836801591183153,
5
+ "train_runtime": 49865.2618,
6
+ "train_samples": 50802,
7
+ "train_samples_per_second": 10.188,
8
+ "train_steps_per_second": 0.106
9
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "hZzy/qwen2.5-0.5b-sft-news-IFT",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151644,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 24,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "pad_token_id": 151645,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": 32768,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "float16",
25
+ "transformers_version": "4.42.0",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151665
30
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151644,
3
+ "eos_token_id": 151645,
4
+ "max_new_tokens": 2048,
5
+ "pad_token_id": 151645,
6
+ "transformers_version": "4.42.0"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384313fb1360a2598e06964ad0ef2c2c174bd7e704195febc19430c83db50c2c
3
+ size 987611904
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_start|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ }
27
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>"
185
+ ],
186
+ "bos_token": "<|im_start|>",
187
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
188
+ "clean_up_tokenization_spaces": false,
189
+ "eos_token": "<|im_end|>",
190
+ "errors": "replace",
191
+ "model_max_length": 2048,
192
+ "pad_token": "<|im_end|>",
193
+ "split_special_tokens": false,
194
+ "tokenizer_class": "Qwen2Tokenizer",
195
+ "unk_token": null
196
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.995276334435522,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.15836801591183153,
5
+ "train_runtime": 49865.2618,
6
+ "train_samples": 50802,
7
+ "train_samples_per_second": 10.188,
8
+ "train_steps_per_second": 0.106
9
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10621b82969f8423c7f1786671fff7586249d2e90d14a5c932545238f115038f
3
+ size 8120
vocab.json ADDED
The diff for this file is too large to render. See raw diff