Elfsong committed (verified)
Commit: c7ee26b
Parent: 09650b6

Training in progress, step 400, checkpoint

last-checkpoint/adapter_config.json CHANGED
@@ -20,12 +20,12 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
     "q_proj",
+    "o_proj",
+    "up_proj",
     "gate_proj",
-    "down_proj",
     "k_proj",
-    "up_proj",
-    "o_proj",
     "v_proj"
   ],
   "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26bb5c693d335a109ecd3fe60b82014ff8dfa61913d7fee21edd21791598ad29
+oid sha256:43ef35dda0f3a5a508eab117460ff6a331211e06b198dc6e1a315d6d8897b434
 size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dce9ee9dd4e83e04d18e5f46015db2023307957f40df415905d533b778c1ce59
-size 168149074
+oid sha256:784499b9c57080a9aa835529570368af18b60b12be5d1a140d26af708d454530
+size 168155346
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ece9be991d1c749eb41eeb3d0b4d2b0f0e42672da5226547e851b5cc6a20a704
+oid sha256:35297f49e243ed1d027a26f9d8cc60d7b1b3d88f3cde5bada6803ecb49d4e54c
 size 1064
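
Note: the adapter_model.safetensors, optimizer.pt, and scheduler.pt hunks (and training_args.bin below) only update Git LFS pointer files; the repository tracks each artifact's sha256 oid and byte size, while the binary itself lives in LFS storage. If needed, a downloaded object can be checked against its pointer with a small script like the one below (the helper name is made up for illustration; the oid and size are taken from the optimizer.pt hunk above):

# Illustrative helper: verify a downloaded LFS object against its pointer.
import hashlib
import os

def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

print(matches_lfs_pointer(
    "last-checkpoint/optimizer.pt",
    "784499b9c57080a9aa835529570368af18b60b12be5d1a140d26af708d454530",
    168155346,
))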
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.04523181304183943,
+  "epoch": 0.09046362608367886,
   "eval_steps": 500,
-  "global_step": 200,
+  "global_step": 400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -287,6 +287,286 @@
       "learning_rate": 4.993395348466544e-05,
       "loss": 0.7012,
       "step": 200
+    },
+    {
+      "epoch": 0.046362608367885416,
+      "grad_norm": 0.28732138872146606,
+      "learning_rate": 4.992718700485085e-05,
+      "loss": 0.7247,
+      "step": 205
+    },
+    {
+      "epoch": 0.047493403693931395,
+      "grad_norm": 0.2657299339771271,
+      "learning_rate": 4.99200911095478e-05,
+      "loss": 0.7247,
+      "step": 210
+    },
+    {
+      "epoch": 0.04862419901997738,
+      "grad_norm": 0.30124104022979736,
+      "learning_rate": 4.991266589252933e-05,
+      "loss": 0.7001,
+      "step": 215
+    },
+    {
+      "epoch": 0.049754994346023367,
+      "grad_norm": 0.3533799946308136,
+      "learning_rate": 4.990491145192049e-05,
+      "loss": 0.7714,
+      "step": 220
+    },
+    {
+      "epoch": 0.05088578967206935,
+      "grad_norm": 0.29441332817077637,
+      "learning_rate": 4.989682789019706e-05,
+      "loss": 0.7338,
+      "step": 225
+    },
+    {
+      "epoch": 0.05201658499811534,
+      "grad_norm": 0.2670339345932007,
+      "learning_rate": 4.988841531418418e-05,
+      "loss": 0.719,
+      "step": 230
+    },
+    {
+      "epoch": 0.053147380324161324,
+      "grad_norm": 0.44572877883911133,
+      "learning_rate": 4.9879673835054955e-05,
+      "loss": 0.7315,
+      "step": 235
+    },
+    {
+      "epoch": 0.05427817565020731,
+      "grad_norm": 0.29553067684173584,
+      "learning_rate": 4.9870603568328985e-05,
+      "loss": 0.7495,
+      "step": 240
+    },
+    {
+      "epoch": 0.055408970976253295,
+      "grad_norm": 0.26393231749534607,
+      "learning_rate": 4.986120463387084e-05,
+      "loss": 0.6637,
+      "step": 245
+    },
+    {
+      "epoch": 0.05653976630229928,
+      "grad_norm": 0.35982418060302734,
+      "learning_rate": 4.985147715588845e-05,
+      "loss": 0.7571,
+      "step": 250
+    },
+    {
+      "epoch": 0.05767056162834527,
+      "grad_norm": 0.38977113366127014,
+      "learning_rate": 4.9841421262931506e-05,
+      "loss": 0.7551,
+      "step": 255
+    },
+    {
+      "epoch": 0.05880135695439125,
+      "grad_norm": 0.28935956954956055,
+      "learning_rate": 4.983103708788972e-05,
+      "loss": 0.7863,
+      "step": 260
+    },
+    {
+      "epoch": 0.05993215228043724,
+      "grad_norm": 0.34443530440330505,
+      "learning_rate": 4.98203247679911e-05,
+      "loss": 0.8106,
+      "step": 265
+    },
+    {
+      "epoch": 0.061062947606483224,
+      "grad_norm": 0.4763427674770355,
+      "learning_rate": 4.980928444480011e-05,
+      "loss": 0.7729,
+      "step": 270
+    },
+    {
+      "epoch": 0.06219374293252921,
+      "grad_norm": 0.2860422730445862,
+      "learning_rate": 4.9797916264215824e-05,
+      "loss": 0.7593,
+      "step": 275
+    },
+    {
+      "epoch": 0.0633245382585752,
+      "grad_norm": 0.28870680928230286,
+      "learning_rate": 4.978622037647e-05,
+      "loss": 0.7574,
+      "step": 280
+    },
+    {
+      "epoch": 0.06445533358462119,
+      "grad_norm": 0.40277180075645447,
+      "learning_rate": 4.9774196936125056e-05,
+      "loss": 0.799,
+      "step": 285
+    },
+    {
+      "epoch": 0.06558612891066717,
+      "grad_norm": 0.3290288746356964,
+      "learning_rate": 4.9761846102072065e-05,
+      "loss": 0.7519,
+      "step": 290
+    },
+    {
+      "epoch": 0.06671692423671316,
+      "grad_norm": 0.3139791190624237,
+      "learning_rate": 4.9749168037528635e-05,
+      "loss": 0.6837,
+      "step": 295
+    },
+    {
+      "epoch": 0.06784771956275915,
+      "grad_norm": 0.30802035331726074,
+      "learning_rate": 4.9736162910036785e-05,
+      "loss": 0.7662,
+      "step": 300
+    },
+    {
+      "epoch": 0.06897851488880513,
+      "grad_norm": 0.34561124444007874,
+      "learning_rate": 4.972283089146067e-05,
+      "loss": 0.6897,
+      "step": 305
+    },
+    {
+      "epoch": 0.07010931021485112,
+      "grad_norm": 0.3372039198875427,
+      "learning_rate": 4.970917215798438e-05,
+      "loss": 0.7344,
+      "step": 310
+    },
+    {
+      "epoch": 0.0712401055408971,
+      "grad_norm": 0.41160914301872253,
+      "learning_rate": 4.9695186890109567e-05,
+      "loss": 0.832,
+      "step": 315
+    },
+    {
+      "epoch": 0.07237090086694309,
+      "grad_norm": 0.2914057672023773,
+      "learning_rate": 4.968087527265306e-05,
+      "loss": 0.7113,
+      "step": 320
+    },
+    {
+      "epoch": 0.07350169619298907,
+      "grad_norm": 0.3247675597667694,
+      "learning_rate": 4.966623749474445e-05,
+      "loss": 0.6996,
+      "step": 325
+    },
+    {
+      "epoch": 0.07463249151903506,
+      "grad_norm": 0.435735285282135,
+      "learning_rate": 4.9651273749823546e-05,
+      "loss": 0.8236,
+      "step": 330
+    },
+    {
+      "epoch": 0.07576328684508105,
+      "grad_norm": 0.3213053047657013,
+      "learning_rate": 4.963598423563788e-05,
+      "loss": 0.7012,
+      "step": 335
+    },
+    {
+      "epoch": 0.07689408217112703,
+      "grad_norm": 0.3745056390762329,
+      "learning_rate": 4.962036915424004e-05,
+      "loss": 0.7018,
+      "step": 340
+    },
+    {
+      "epoch": 0.07802487749717302,
+      "grad_norm": 0.28368842601776123,
+      "learning_rate": 4.960442871198503e-05,
+      "loss": 0.7084,
+      "step": 345
+    },
+    {
+      "epoch": 0.079155672823219,
+      "grad_norm": 0.2621799409389496,
+      "learning_rate": 4.958816311952752e-05,
+      "loss": 0.7217,
+      "step": 350
+    },
+    {
+      "epoch": 0.08028646814926499,
+      "grad_norm": 0.25561287999153137,
+      "learning_rate": 4.95715725918191e-05,
+      "loss": 0.7616,
+      "step": 355
+    },
+    {
+      "epoch": 0.08141726347531097,
+      "grad_norm": 0.3495071828365326,
+      "learning_rate": 4.9554657348105385e-05,
+      "loss": 0.7061,
+      "step": 360
+    },
+    {
+      "epoch": 0.08254805880135696,
+      "grad_norm": 0.3490068018436432,
+      "learning_rate": 4.953741761192317e-05,
+      "loss": 0.7809,
+      "step": 365
+    },
+    {
+      "epoch": 0.08367885412740295,
+      "grad_norm": 0.39416739344596863,
+      "learning_rate": 4.9519853611097434e-05,
+      "loss": 0.7282,
+      "step": 370
+    },
+    {
+      "epoch": 0.08480964945344893,
+      "grad_norm": 0.2763444185256958,
+      "learning_rate": 4.950196557773837e-05,
+      "loss": 0.7262,
+      "step": 375
+    },
+    {
+      "epoch": 0.08594044477949492,
+      "grad_norm": 0.29107871651649475,
+      "learning_rate": 4.948375374823828e-05,
+      "loss": 0.7346,
+      "step": 380
+    },
+    {
+      "epoch": 0.0870712401055409,
+      "grad_norm": 0.28965339064598083,
+      "learning_rate": 4.946521836326847e-05,
+      "loss": 0.6768,
+      "step": 385
+    },
+    {
+      "epoch": 0.08820203543158689,
+      "grad_norm": 0.31072792410850525,
+      "learning_rate": 4.9446359667776065e-05,
+      "loss": 0.7277,
+      "step": 390
+    },
+    {
+      "epoch": 0.08933283075763288,
+      "grad_norm": 0.2789427936077118,
+      "learning_rate": 4.9427177910980794e-05,
+      "loss": 0.7481,
+      "step": 395
+    },
+    {
+      "epoch": 0.09046362608367886,
+      "grad_norm": 0.2573710083961487,
+      "learning_rate": 4.9407673346371644e-05,
+      "loss": 0.7077,
+      "step": 400
     }
   ],
   "logging_steps": 5,
@@ -306,7 +586,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.1310339876808294e+17,
+  "total_flos": 4.2708606055664845e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9319d9c079514e379e4c3967b718f85a19bc1f8b61112bad04a43a46d5d6afe2
-size 5752
+oid sha256:c29b6114cee3a1eb0c6657320d373e2561ec03a011bc688ec4cc2b0b164a6831
+size 5816