Alex Shafranovich committed
Commit 6070cba • 1 Parent(s): e695e96
Upload folder using huggingface_hub
Browse files
- README.md +143 -0
- all_results.json +22 -0
- checkpoint-29669/config.json +31 -0
- checkpoint-29669/generation_config.json +8 -0
- checkpoint-29669/merges.txt +0 -0
- checkpoint-29669/model-00001-of-00002.safetensors +3 -0
- checkpoint-29669/model-00002-of-00002.safetensors +3 -0
- checkpoint-29669/model.safetensors.index.json +225 -0
- checkpoint-29669/optimizer.pt +3 -0
- checkpoint-29669/rng_state.pth +3 -0
- checkpoint-29669/scheduler.pt +3 -0
- checkpoint-29669/special_tokens_map.json +34 -0
- checkpoint-29669/tokenizer.json +0 -0
- checkpoint-29669/tokenizer_config.json +154 -0
- checkpoint-29669/trainer_state.json +0 -0
- checkpoint-29669/training_args.bin +3 -0
- checkpoint-29669/vocab.json +0 -0
- config.json +31 -0
- eval_results.json +16 -0
- generation_config.json +8 -0
- merges.txt +0 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +225 -0
- special_tokens_map.json +34 -0
- tokenizer.json +0 -0
- tokenizer_config.json +154 -0
- train_results.json +9 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,143 @@
---
library_name: transformers
license: apache-2.0
base_model: HuggingFaceTB/SmolLM-1.7B-Instruct
tags:
- alignment-handbook
- generated_from_trainer
datasets:
- BAAI/Infinity-Preference
model-index:
- name: smollm-1.7b-instruct-simpo-v2
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# smollm-1.7b-instruct-simpo-v2

This model is a fine-tuned version of [HuggingFaceTB/SmolLM-1.7B-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM-1.7B-Instruct) on the BAAI/Infinity-Preference dataset.
It achieves the following results on the evaluation set:
- Loss: 3.0877
- Rewards/chosen: -22.8949
- Rewards/rejected: -24.4444
- Rewards/accuracies: 0.6300
- Rewards/margins: 1.5495
- Logps/rejected: -2.4444
- Logps/chosen: -2.2895
- Logits/rejected: -2.4913
- Logits/chosen: -2.3131

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 1e-06
- train_batch_size: 2
- eval_batch_size: 4
- seed: 42
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 1

### Training results

| Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
|:-------------:|:------:|:-----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
| 3.2871 | 0.0135 | 400 | 3.4379 | -16.5537 | -16.5135 | 0.4700 | -0.0402 | -1.6513 | -1.6554 | -0.7019 | -0.7007 |
| 3.4746 | 0.0270 | 800 | 3.4370 | -16.5561 | -16.5146 | 0.4700 | -0.0415 | -1.6515 | -1.6556 | -0.7002 | -0.6988 |
| 2.8856 | 0.0404 | 1200 | 3.4399 | -16.5623 | -16.5160 | 0.4700 | -0.0464 | -1.6516 | -1.6562 | -0.6997 | -0.6984 |
| 3.8819 | 0.0539 | 1600 | 3.4374 | -16.5639 | -16.5248 | 0.4700 | -0.0391 | -1.6525 | -1.6564 | -0.7012 | -0.6998 |
| 3.622 | 0.0674 | 2000 | 3.4319 | -16.5838 | -16.5551 | 0.4700 | -0.0288 | -1.6555 | -1.6584 | -0.7089 | -0.7069 |
| 3.6924 | 0.0809 | 2400 | 3.4273 | -16.6109 | -16.5901 | 0.4700 | -0.0208 | -1.6590 | -1.6611 | -0.7032 | -0.7007 |
| 3.0591 | 0.0944 | 2800 | 3.4161 | -16.6863 | -16.6979 | 0.4600 | 0.0117 | -1.6698 | -1.6686 | -0.7295 | -0.7253 |
| 3.4937 | 0.1079 | 3200 | 3.4013 | -16.7982 | -16.8590 | 0.4700 | 0.0608 | -1.6859 | -1.6798 | -0.7483 | -0.7412 |
| 3.1565 | 0.1213 | 3600 | 3.3852 | -16.8542 | -16.9385 | 0.4700 | 0.0843 | -1.6939 | -1.6854 | -0.7618 | -0.7526 |
| 2.7504 | 0.1348 | 4000 | 3.3711 | -16.9128 | -17.0175 | 0.4800 | 0.1047 | -1.7018 | -1.6913 | -0.7684 | -0.7574 |
| 3.0312 | 0.1483 | 4400 | 3.3606 | -16.9720 | -17.0910 | 0.4900 | 0.1190 | -1.7091 | -1.6972 | -0.7754 | -0.7629 |
| 4.145 | 0.1618 | 4800 | 3.3407 | -17.0816 | -17.2375 | 0.5100 | 0.1559 | -1.7238 | -1.7082 | -0.7902 | -0.7746 |
| 3.9514 | 0.1753 | 5200 | 3.3126 | -17.1952 | -17.3924 | 0.5100 | 0.1972 | -1.7392 | -1.7195 | -0.8201 | -0.8001 |
| 2.4942 | 0.1887 | 5600 | 3.2864 | -17.2731 | -17.4955 | 0.5100 | 0.2223 | -1.7495 | -1.7273 | -0.8187 | -0.7960 |
| 2.6757 | 0.2022 | 6000 | 3.2615 | -17.3603 | -17.6063 | 0.5200 | 0.2460 | -1.7606 | -1.7360 | -0.7977 | -0.7735 |
| 2.8576 | 0.2157 | 6400 | 3.2382 | -17.5060 | -17.8132 | 0.5500 | 0.3072 | -1.7813 | -1.7506 | -0.8562 | -0.8260 |
| 3.7483 | 0.2292 | 6800 | 3.2140 | -17.5965 | -17.9376 | 0.5700 | 0.3411 | -1.7938 | -1.7596 | -0.8751 | -0.8407 |
| 3.5349 | 0.2427 | 7200 | 3.2035 | -17.6663 | -18.0193 | 0.5800 | 0.3530 | -1.8019 | -1.7666 | -0.8780 | -0.8417 |
| 2.0604 | 0.2562 | 7600 | 3.1925 | -17.7393 | -18.1045 | 0.6100 | 0.3652 | -1.8104 | -1.7739 | -0.9017 | -0.8602 |
| 5.7031 | 0.2696 | 8000 | 3.1672 | -18.0175 | -18.4936 | 0.6100 | 0.4760 | -1.8494 | -1.8018 | -0.9982 | -0.9467 |
| 2.6005 | 0.2831 | 8400 | 3.1475 | -18.1162 | -18.6283 | 0.6100 | 0.5121 | -1.8628 | -1.8116 | -1.0732 | -1.0161 |
| 1.9787 | 0.2966 | 8800 | 3.1226 | -18.3260 | -18.9198 | 0.6100 | 0.5938 | -1.8920 | -1.8326 | -1.1691 | -1.1062 |
| 2.8347 | 0.3101 | 9200 | 3.1156 | -18.4632 | -19.0934 | 0.6100 | 0.6301 | -1.9093 | -1.8463 | -1.2592 | -1.1910 |
| 2.701 | 0.3236 | 9600 | 3.1022 | -18.5083 | -19.1346 | 0.6100 | 0.6264 | -1.9135 | -1.8508 | -1.2785 | -1.2073 |
| 3.772 | 0.3371 | 10000 | 3.0772 | -18.5843 | -19.2491 | 0.6100 | 0.6649 | -1.9249 | -1.8584 | -1.3345 | -1.2587 |
| 2.7414 | 0.3505 | 10400 | 3.0551 | -18.8305 | -19.5946 | 0.6100 | 0.7641 | -1.9595 | -1.8830 | -1.3824 | -1.3004 |
| 2.0287 | 0.3640 | 10800 | 3.0534 | -18.9934 | -19.7985 | 0.6200 | 0.8051 | -1.9798 | -1.8993 | -1.4355 | -1.3467 |
| 1.0473 | 0.3775 | 11200 | 3.0528 | -19.1581 | -19.9858 | 0.6100 | 0.8277 | -1.9986 | -1.9158 | -1.5109 | -1.4173 |
| 2.8106 | 0.3910 | 11600 | 3.0436 | -19.1763 | -19.9989 | 0.6100 | 0.8226 | -1.9999 | -1.9176 | -1.5138 | -1.4206 |
| 3.0344 | 0.4045 | 12000 | 3.0333 | -19.2526 | -20.1079 | 0.6100 | 0.8553 | -2.0108 | -1.9253 | -1.5628 | -1.4657 |
| 2.1886 | 0.4179 | 12400 | 3.0187 | -19.4500 | -20.3818 | 0.6300 | 0.9318 | -2.0382 | -1.9450 | -1.6246 | -1.5217 |
| 4.1181 | 0.4314 | 12800 | 3.0086 | -19.6204 | -20.6104 | 0.6300 | 0.9900 | -2.0610 | -1.9620 | -1.6886 | -1.5818 |
| 1.6647 | 0.4449 | 13200 | 3.0126 | -19.7773 | -20.7949 | 0.6300 | 1.0176 | -2.0795 | -1.9777 | -1.7307 | -1.6181 |
| 4.8533 | 0.4584 | 13600 | 3.0012 | -19.9001 | -20.9633 | 0.6300 | 1.0632 | -2.0963 | -1.9900 | -1.7437 | -1.6288 |
| 2.9945 | 0.4719 | 14000 | 3.0071 | -19.9831 | -21.0361 | 0.6300 | 1.0529 | -2.1036 | -1.9983 | -1.7839 | -1.6667 |
| 2.9377 | 0.4854 | 14400 | 2.9946 | -20.1165 | -21.2172 | 0.6400 | 1.1007 | -2.1217 | -2.0117 | -1.8386 | -1.7178 |
| 2.7856 | 0.4988 | 14800 | 2.9908 | -20.2830 | -21.4151 | 0.6300 | 1.1322 | -2.1415 | -2.0283 | -1.8720 | -1.7468 |
| 4.9446 | 0.5123 | 15200 | 2.9905 | -20.4144 | -21.5669 | 0.6300 | 1.1525 | -2.1567 | -2.0414 | -1.9057 | -1.7760 |
| 3.2834 | 0.5258 | 15600 | 2.9858 | -20.4428 | -21.5993 | 0.6300 | 1.1565 | -2.1599 | -2.0443 | -1.8928 | -1.7633 |
| 1.8705 | 0.5393 | 16000 | 2.9888 | -20.5922 | -21.7774 | 0.6300 | 1.1853 | -2.1777 | -2.0592 | -1.9340 | -1.8009 |
| 4.0587 | 0.5528 | 16400 | 2.9925 | -20.8812 | -22.1359 | 0.6300 | 1.2547 | -2.2136 | -2.0881 | -2.0019 | -1.8627 |
| 3.0706 | 0.5662 | 16800 | 2.9946 | -21.1005 | -22.4176 | 0.6300 | 1.3171 | -2.2418 | -2.1101 | -2.0533 | -1.9104 |
| 3.152 | 0.5797 | 17200 | 2.9916 | -21.2937 | -22.6723 | 0.6200 | 1.3786 | -2.2672 | -2.1294 | -2.1094 | -1.9627 |
| 1.8856 | 0.5932 | 17600 | 2.9847 | -21.2727 | -22.6463 | 0.6200 | 1.3736 | -2.2646 | -2.1273 | -2.1108 | -1.9637 |
| 1.1291 | 0.6067 | 18000 | 2.9981 | -21.5313 | -22.9507 | 0.6200 | 1.4194 | -2.2951 | -2.1531 | -2.1736 | -2.0212 |
| 2.9894 | 0.6202 | 18400 | 3.0033 | -21.6191 | -23.0276 | 0.6200 | 1.4085 | -2.3028 | -2.1619 | -2.2089 | -2.0543 |
| 3.497 | 0.6337 | 18800 | 3.0252 | -21.8198 | -23.2426 | 0.6200 | 1.4228 | -2.3243 | -2.1820 | -2.2285 | -2.0714 |
| 3.18 | 0.6471 | 19200 | 3.0307 | -21.8887 | -23.3005 | 0.6200 | 1.4117 | -2.3300 | -2.1889 | -2.2462 | -2.0862 |
| 1.9522 | 0.6606 | 19600 | 3.0391 | -21.9179 | -23.3214 | 0.6300 | 1.4035 | -2.3321 | -2.1918 | -2.2476 | -2.0875 |
| 2.4878 | 0.6741 | 20000 | 3.0431 | -22.1021 | -23.5543 | 0.6300 | 1.4522 | -2.3554 | -2.2102 | -2.2969 | -2.1333 |
| 2.3506 | 0.6876 | 20400 | 3.0453 | -22.2379 | -23.7220 | 0.6300 | 1.4841 | -2.3722 | -2.2238 | -2.3258 | -2.1603 |
| 3.9719 | 0.7011 | 20800 | 3.0591 | -22.2718 | -23.7317 | 0.6300 | 1.4599 | -2.3732 | -2.2272 | -2.3263 | -2.1600 |
| 1.4942 | 0.7146 | 21200 | 3.0574 | -22.3226 | -23.8044 | 0.6300 | 1.4819 | -2.3804 | -2.2323 | -2.3352 | -2.1680 |
| 0.8797 | 0.7280 | 21600 | 3.0616 | -22.3419 | -23.8235 | 0.6300 | 1.4816 | -2.3823 | -2.2342 | -2.3394 | -2.1721 |
| 2.8176 | 0.7415 | 22000 | 3.0751 | -22.4788 | -23.9643 | 0.6300 | 1.4855 | -2.3964 | -2.2479 | -2.3767 | -2.2073 |
| 3.3744 | 0.7550 | 22400 | 3.0775 | -22.6028 | -24.1137 | 0.6300 | 1.5109 | -2.4114 | -2.2603 | -2.4146 | -2.2423 |
| 1.9708 | 0.7685 | 22800 | 3.0768 | -22.6249 | -24.1479 | 0.6300 | 1.5231 | -2.4148 | -2.2625 | -2.4216 | -2.2482 |
| 2.1589 | 0.7820 | 23200 | 3.0697 | -22.6570 | -24.1936 | 0.6300 | 1.5367 | -2.4194 | -2.2657 | -2.4323 | -2.2591 |
| 3.0872 | 0.7954 | 23600 | 3.0813 | -22.7174 | -24.2489 | 0.6300 | 1.5315 | -2.4249 | -2.2717 | -2.4430 | -2.2683 |
| 3.9705 | 0.8089 | 24000 | 3.0806 | -22.7644 | -24.3076 | 0.6300 | 1.5432 | -2.4308 | -2.2764 | -2.4598 | -2.2840 |
| 3.5691 | 0.8224 | 24400 | 3.0807 | -22.7627 | -24.2931 | 0.6300 | 1.5304 | -2.4293 | -2.2763 | -2.4621 | -2.2857 |
| 1.4467 | 0.8359 | 24800 | 3.0854 | -22.8132 | -24.3525 | 0.6300 | 1.5393 | -2.4353 | -2.2813 | -2.4742 | -2.2963 |
| 2.7241 | 0.8494 | 25200 | 3.0862 | -22.8300 | -24.3745 | 0.6300 | 1.5445 | -2.4375 | -2.2830 | -2.4770 | -2.2988 |
| 2.7441 | 0.8629 | 25600 | 3.0866 | -22.8450 | -24.3876 | 0.6300 | 1.5427 | -2.4388 | -2.2845 | -2.4823 | -2.3048 |
| 1.4801 | 0.8763 | 26000 | 3.0839 | -22.8522 | -24.4010 | 0.6300 | 1.5488 | -2.4401 | -2.2852 | -2.4827 | -2.3057 |
| 2.5965 | 0.8898 | 26400 | 3.0841 | -22.8629 | -24.4169 | 0.6300 | 1.5540 | -2.4417 | -2.2863 | -2.4877 | -2.3095 |
| 3.6415 | 0.9033 | 26800 | 3.0893 | -22.8830 | -24.4340 | 0.6300 | 1.5510 | -2.4434 | -2.2883 | -2.4894 | -2.3114 |
| 2.0584 | 0.9168 | 27200 | 3.0894 | -22.8879 | -24.4268 | 0.6300 | 1.5389 | -2.4427 | -2.2888 | -2.4917 | -2.3134 |
| 2.5068 | 0.9303 | 27600 | 3.0896 | -22.8936 | -24.4408 | 0.6300 | 1.5472 | -2.4441 | -2.2894 | -2.4922 | -2.3134 |
| 0.677 | 0.9437 | 28000 | 3.0835 | -22.8876 | -24.4472 | 0.6300 | 1.5596 | -2.4447 | -2.2888 | -2.4919 | -2.3134 |
| 2.5931 | 0.9572 | 28400 | 3.0875 | -22.8938 | -24.4419 | 0.6300 | 1.5481 | -2.4442 | -2.2894 | -2.4907 | -2.3117 |
| 4.4413 | 0.9707 | 28800 | 3.0893 | -22.8952 | -24.4383 | 0.6300 | 1.5431 | -2.4438 | -2.2895 | -2.4914 | -2.3131 |
| 2.7584 | 0.9842 | 29200 | 3.0874 | -22.8946 | -24.4410 | 0.6300 | 1.5464 | -2.4441 | -2.2895 | -2.4894 | -2.3112 |
| 4.4406 | 0.9977 | 29600 | 3.0877 | -22.8949 | -24.4444 | 0.6300 | 1.5495 | -2.4444 | -2.2895 | -2.4913 | -2.3131 |

### Framework versions

- Transformers 4.45.1
- Pytorch 2.2.2
- Datasets 3.0.1
- Tokenizers 0.20.0
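For reference, a minimal usage sketch with transformers (not part of the auto-generated card above): the repo id below is a placeholder, since the card does not state the hub path, and the prompt and generation settings are illustrative only.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "<hub-user>/smollm-1.7b-instruct-simpo-v2"  # placeholder: substitute the actual repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# The tokenizer ships a ChatML-style chat template (<|im_start|> / <|im_end|>),
# so apply_chat_template builds prompts in the format used during training.
messages = [{"role": "user", "content": "Summarize what preference fine-tuning does."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```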
all_results.json
ADDED
@@ -0,0 +1,22 @@
{
  "epoch": 1.0,
  "eval_logits/chosen": -2.3130569458007812,
  "eval_logits/rejected": -2.4918906688690186,
  "eval_logps/chosen": -2.289309024810791,
  "eval_logps/rejected": -2.4444260597229004,
  "eval_loss": 3.0870368480682373,
  "eval_rewards/accuracies": 0.6299999952316284,
  "eval_rewards/chosen": -22.893089294433594,
  "eval_rewards/margins": 1.5511717796325684,
  "eval_rewards/rejected": -24.444263458251953,
  "eval_runtime": 12.8093,
  "eval_samples": 100,
  "eval_samples_per_second": 7.807,
  "eval_steps_per_second": 1.952,
  "total_flos": 0.0,
  "train_loss": 2.970746964952526,
  "train_runtime": 37264.6051,
  "train_samples": 59338,
  "train_samples_per_second": 1.592,
  "train_steps_per_second": 0.796
}
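As a quick consistency check on these figures: 59338 train samples over a 37264.6 s runtime gives 59338 / 37264.6 ≈ 1.592 samples per second, and 100 eval samples over 12.8093 s gives ≈ 7.807, matching the reported samples-per-second values.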
checkpoint-29669/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "HuggingFaceTB/SmolLM-1.7B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 49152
}
checkpoint-29669/generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_new_tokens": 40,
  "pad_token_id": 2,
  "transformers_version": "4.45.1"
}
checkpoint-29669/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
checkpoint-29669/model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba5961dcb97ee4146fd329d5d0b039eb7e0241e5fe4a67c96c8dfbde86739380
size 4999906800
checkpoint-29669/model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2df045e8ab92108fc89a739cc55bb85292cadb4116352e3975a7b8ce2b8bc66
size 1845623728
checkpoint-29669/model.safetensors.index.json
ADDED
@@ -0,0 +1,225 @@
{
  "metadata": {
    "total_size": 6845505536
  },
  "weight_map": {
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.norm.weight": "model-00002-of-00002.safetensors"
  }
}
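As a quick consistency check: at the float32 dtype declared in config.json (4 bytes per parameter), the total_size of 6845505536 bytes corresponds to 6845505536 / 4 = 1711376384 ≈ 1.71B parameters, consistent with the SmolLM-1.7B base model.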
checkpoint-29669/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1dbaccbc05583fb47fb8cda07d0c6a431ac71bdece447e64cf7d2a5cc09c3c6
size 13691201082
checkpoint-29669/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:267baed24ffd8664cf6ae854c261b8f91076718da47c1d6de29c00da925cd418
size 14244
checkpoint-29669/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79d43ab18eb2c28815a0692248ca2de4053087359fb788ac04a6bed01792642c
size 1064
checkpoint-29669/special_tokens_map.json
ADDED
@@ -0,0 +1,34 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "bos_token": {
    "content": "<|im_start|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
checkpoint-29669/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
checkpoint-29669/tokenizer_config.json
ADDED
@@ -0,0 +1,154 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<repo_name>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "<reponame>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "5": {
      "content": "<file_sep>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "6": {
      "content": "<filename>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "7": {
      "content": "<gh_stars>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "8": {
      "content": "<issue_start>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "9": {
      "content": "<issue_comment>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "10": {
      "content": "<issue_closed>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "11": {
      "content": "<jupyter_start>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "12": {
      "content": "<jupyter_text>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "13": {
      "content": "<jupyter_code>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "14": {
      "content": "<jupyter_output>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "15": {
      "content": "<jupyter_script>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "16": {
      "content": "<empty_output>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "bos_token": "<|im_start|>",
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "model_max_length": 2048,
  "pad_token": "<|im_end|>",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>",
  "vocab_size": 49152
}
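For reference, the chat_template above is ChatML-style; rendering messages = [{"role": "user", "content": "Hello"}] with add_generation_prompt=True yields:

<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant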
checkpoint-29669/trainer_state.json
ADDED
The diff for this file is too large to render.
See raw diff
checkpoint-29669/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38006d9734a8a9819ede4efac94f7c5e598a5b11271b0e4394dae0b396256caf
size 5752
checkpoint-29669/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "HuggingFaceTB/SmolLM-1.7B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 49152
}
eval_results.json
ADDED
@@ -0,0 +1,16 @@
{
  "epoch": 1.0,
  "eval_logits/chosen": -2.3130569458007812,
  "eval_logits/rejected": -2.4918906688690186,
  "eval_logps/chosen": -2.289309024810791,
  "eval_logps/rejected": -2.4444260597229004,
  "eval_loss": 3.0870368480682373,
  "eval_rewards/accuracies": 0.6299999952316284,
  "eval_rewards/chosen": -22.893089294433594,
  "eval_rewards/margins": 1.5511717796325684,
  "eval_rewards/rejected": -24.444263458251953,
  "eval_runtime": 12.8093,
  "eval_samples": 100,
  "eval_samples_per_second": 7.807,
  "eval_steps_per_second": 1.952
}
generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_new_tokens": 40,
  "pad_token_id": 2,
  "transformers_version": "4.45.1"
}
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba5961dcb97ee4146fd329d5d0b039eb7e0241e5fe4a67c96c8dfbde86739380
size 4999906800
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2df045e8ab92108fc89a739cc55bb85292cadb4116352e3975a7b8ce2b8bc66
size 1845623728
model.safetensors.index.json
ADDED
@@ -0,0 +1,225 @@
{
  "metadata": {
    "total_size": 6845505536
  },
  "weight_map": {
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
150 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
151 |
+
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
152 |
+
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
153 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
154 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
155 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
156 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
157 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
158 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
159 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
160 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
161 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
162 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
163 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
164 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
165 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
166 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
167 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
168 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
169 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
170 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
171 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
172 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
173 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
174 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
175 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
176 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
177 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
178 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
179 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
180 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
181 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
182 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
183 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
184 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
185 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
186 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
187 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
188 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
189 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
190 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
191 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
192 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
193 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
194 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
195 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
196 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
197 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
198 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
199 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
200 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
201 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
202 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
203 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
204 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
205 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
206 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
207 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
208 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
209 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
210 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
211 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
212 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
213 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
214 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
215 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
216 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
217 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
218 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
219 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
220 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
221 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
222 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
223 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
224 |
+
}
|
225 |
+
}
|
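The `weight_map` above is what `transformers` consults when loading a sharded checkpoint: each tensor name points at the shard file that stores it. Note that layer 17 straddles the shard boundary (its `q_proj`/`k_proj` weights sit in shard 1 while the rest of the layer sits in shard 2), which is expected when shards are cut by size at tensor granularity. A minimal sketch of querying the index, assuming this repository has been downloaded to the working directory:

```python
import json

# Load the index shown above (path assumes a local clone of this repo).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]

# Which shard holds each tensor of layer 17? q_proj/k_proj resolve to
# model-00001-of-00002.safetensors, everything else to shard 2.
for name, shard in sorted(weight_map.items()):
    if name.startswith("model.layers.17."):
        print(f"{name} -> {shard}")
```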
special_tokens_map.json
ADDED
@@ -0,0 +1,34 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
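In this map the ChatML delimiters double as control tokens: `<|im_start|>` is BOS, and `<|im_end|>` serves as both EOS and padding token. A quick sanity check, assuming the files in this commit have been downloaded locally (the path below is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path: point this at a local clone of this repository.
tok = AutoTokenizer.from_pretrained("./path/to/this-repo")

print(tok.bos_token)  # <|im_start|>
print(tok.eos_token)  # <|im_end|>
print(tok.pad_token)  # <|im_end|>
print(tok.unk_token)  # <|endoftext|>

# Reusing <|im_end|> for padding means pad and eos share a single token id.
assert tok.pad_token_id == tok.eos_token_id
```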
tokenizer.json
ADDED
The diff for this file is too large to render; see the raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,154 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}
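The `chat_template` entry above is a Jinja template that wraps each message in ChatML tags and, when asked, appends an opening assistant tag. A short sketch of what it renders to, assuming the tokenizer was loaded from these files as `tok` (as in the snippet further up):

```python
messages = [{"role": "user", "content": "Hello!"}]

# tokenize=False returns the rendered prompt string; add_generation_prompt
# triggers the template's final branch, which opens the assistant turn.
prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```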
train_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 1.0,
+    "total_flos": 0.0,
+    "train_loss": 2.970746964952526,
+    "train_runtime": 37264.6051,
+    "train_samples": 59338,
+    "train_samples_per_second": 1.592,
+    "train_steps_per_second": 0.796
+}
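The throughput figures are internally consistent: samples/s times runtime recovers the one-epoch sample count, and samples/s divided by steps/s gives the number of samples per optimizer step. A quick check using only the numbers above:

```python
# Values copied from train_results.json above.
train_runtime = 37264.6051   # seconds
train_samples = 59338
samples_per_sec = 1.592
steps_per_sec = 0.796

print(samples_per_sec * train_runtime)  # ~59325, matches 59338 up to rounding
print(samples_per_sec / steps_per_sec)  # 2.0 samples per optimizer step
```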
trainer_state.json
ADDED
The diff for this file is too large to render; see the raw diff.
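Although trainer_state.json is not rendered here, it follows the standard `transformers` Trainer layout: a `log_history` list of per-step metric dicts plus global bookkeeping. A sketch for pulling the eval losses out of a local copy:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history holds one dict per logging/evaluation event.
eval_losses = [e["eval_loss"] for e in state["log_history"] if "eval_loss" in e]
print(eval_losses[-5:])
```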
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38006d9734a8a9819ede4efac94f7c5e598a5b11271b0e4394dae0b396256caf
+size 5752
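These three lines are a Git LFS pointer, not the binary itself: the real training_args.bin (5752 bytes) is fetched on `git lfs pull` and can be verified against the pointer's SHA-256. A sketch, assuming the file has been pulled locally:

```python
import hashlib

with open("training_args.bin", "rb") as f:
    data = f.read()

# Both values should match the LFS pointer shown above.
print(len(data))                          # 5752
print(hashlib.sha256(data).hexdigest())   # 38006d9734a8...396256caf
```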
vocab.json
ADDED
The diff for this file is too large to render; see the raw diff.