Qinghao committed on
Commit 6d9a361 · verified · 1 Parent(s): 5489eca

Model save

README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/eLLM-han2024/Qwen2.5-7B-Open-R1-Distill-Debug/runs/zgzq94od)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/eLLM-han2024/Qwen2.5-7B-Open-R1-Distill-Debug/runs/kzcifeec)
 
 
 This model was trained with SFT.
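The hunk context shows the card's quick-start snippet ends with `print(output["generated_text"])`, which points at the transformers text-generation pipeline. A minimal sketch of that kind of usage follows; the repo id, prompt, and generation arguments are placeholders I chose for illustration, not values taken from this commit.

```python
# Hedged sketch of a quick-start consistent with print(output["generated_text"]).
# The model id below is hypothetical (derived from the W&B project name).
from transformers import pipeline

generator = pipeline("text-generation", model="eLLM-han2024/Qwen2.5-7B-Open-R1-Distill-Debug")
output = generator("Explain supervised fine-tuning in one sentence.",
                   max_new_tokens=64, return_full_text=False)[0]
print(output["generated_text"])
```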
all_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-    "total_flos": 4.184202049014989e+16,
-    "train_loss": 1.4971793293952942,
-    "train_runtime": 181.2294,
-    "train_samples_per_second": 3.531,
-    "train_steps_per_second": 0.055
+    "total_flos": 8665747207225344.0,
+    "train_loss": 1.8449461460113525,
+    "train_runtime": 310.7619,
+    "train_samples_per_second": 0.412,
+    "train_steps_per_second": 0.006
 }
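The new run's summary is internally consistent, and the same values are mirrored in train_results.json below: roughly 2 optimizer steps over 310.76 s gives about 0.006 steps per second, and 0.412 samples per second over that runtime implies about 128 samples, i.e. around 64 samples per optimizer step. A small sketch that re-derives these figures from the JSON; the file path and the "samples per step" reading are assumptions on my part.

```python
# Sanity-check the throughput numbers in all_results.json
# (train_results.json carries identical values). Path is assumed.
import json

with open("all_results.json") as f:
    results = json.load(f)

runtime = results["train_runtime"]                               # 310.7619 s
steps = round(results["train_steps_per_second"] * runtime)       # ~2 optimizer steps
samples = round(results["train_samples_per_second"] * runtime)   # ~128 samples
print(f"{steps} steps, {samples} samples, ~{samples // max(steps, 1)} samples per step")
```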
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.49.0"
+}
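The newly added generation_config.json pins the default decoding settings that `generate()` picks up when the model is loaded from the repo. A minimal sketch rebuilding the same defaults with `transformers.GenerationConfig`; the field values are copied from the file above, everything else is illustrative.

```python
# Reconstruct the defaults from the added generation_config.json; loading the
# model from the repo would pick up the same file automatically.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=151643,
    eos_token_id=151643,
    max_new_tokens=2048,
)
print(gen_config)
```

At call time these defaults can still be overridden, e.g. by passing `generation_config=gen_config` or explicit keyword arguments to `model.generate()`.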
train_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-    "total_flos": 4.184202049014989e+16,
-    "train_loss": 1.4971793293952942,
-    "train_runtime": 181.2294,
-    "train_samples_per_second": 3.531,
-    "train_steps_per_second": 0.055
+    "total_flos": 8665747207225344.0,
+    "train_loss": 1.8449461460113525,
+    "train_runtime": 310.7619,
+    "train_samples_per_second": 0.412,
+    "train_steps_per_second": 0.006
 }
trainer_state.json CHANGED
@@ -1,95 +1,39 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.006825938566552901,
+  "epoch": 0.0013651877133105802,
   "eval_steps": 500,
-  "global_step": 10,
+  "global_step": 2,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0006825938566552901,
-      "grad_norm": 28.913602828979492,
+      "grad_norm": 29.01302146911621,
       "learning_rate": 1e-05,
       "loss": 1.9212,
       "step": 1
     },
     {
       "epoch": 0.0013651877133105802,
-      "grad_norm": 26.408185958862305,
-      "learning_rate": 9.698463103929542e-06,
+      "grad_norm": 26.415725708007812,
+      "learning_rate": 0.0,
       "loss": 1.7687,
       "step": 2
     },
     {
-      "epoch": 0.0020477815699658703,
-      "grad_norm": 12.941144943237305,
-      "learning_rate": 8.83022221559489e-06,
-      "loss": 1.8589,
-      "step": 3
-    },
-    {
-      "epoch": 0.0027303754266211604,
-      "grad_norm": 11.555084228515625,
-      "learning_rate": 7.500000000000001e-06,
-      "loss": 1.7133,
-      "step": 4
-    },
-    {
-      "epoch": 0.0034129692832764505,
-      "grad_norm": 6.055721759796143,
-      "learning_rate": 5.8682408883346535e-06,
-      "loss": 1.3889,
-      "step": 5
-    },
-    {
-      "epoch": 0.004095563139931741,
-      "grad_norm": 4.867063522338867,
-      "learning_rate": 4.131759111665349e-06,
-      "loss": 1.3631,
-      "step": 6
-    },
-    {
-      "epoch": 0.00477815699658703,
-      "grad_norm": 6.896249771118164,
-      "learning_rate": 2.5000000000000015e-06,
-      "loss": 1.1494,
-      "step": 7
-    },
-    {
-      "epoch": 0.005460750853242321,
-      "grad_norm": 3.4688620567321777,
-      "learning_rate": 1.1697777844051105e-06,
-      "loss": 1.1455,
-      "step": 8
-    },
-    {
-      "epoch": 0.0061433447098976105,
-      "grad_norm": 4.858311653137207,
-      "learning_rate": 3.015368960704584e-07,
-      "loss": 1.2792,
-      "step": 9
-    },
-    {
-      "epoch": 0.006825938566552901,
-      "grad_norm": 4.7386555671691895,
-      "learning_rate": 0.0,
-      "loss": 1.3836,
-      "step": 10
-    },
-    {
-      "epoch": 0.006825938566552901,
-      "step": 10,
-      "total_flos": 4.184202049014989e+16,
-      "train_loss": 1.4971793293952942,
-      "train_runtime": 181.2294,
-      "train_samples_per_second": 3.531,
-      "train_steps_per_second": 0.055
+      "epoch": 0.0013651877133105802,
+      "step": 2,
+      "total_flos": 8665747207225344.0,
+      "train_loss": 1.8449461460113525,
+      "train_runtime": 310.7619,
+      "train_samples_per_second": 0.412,
+      "train_steps_per_second": 0.006
     }
   ],
   "logging_steps": 1.0,
-  "max_steps": 10,
+  "max_steps": 2,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 500,
@@ -105,7 +49,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.184202049014989e+16,
+  "total_flos": 8665747207225344.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null