{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.2482758620689656,
  "eval_steps": 500,
  "global_step": 120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 0.8273730278015137,
      "learning_rate": 0.0,
      "loss": 2.5179,
      "step": 1
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 0.8761172890663147,
      "learning_rate": 4e-05,
      "loss": 2.8481,
      "step": 2
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 0.7226160764694214,
      "learning_rate": 8e-05,
      "loss": 2.6317,
      "step": 3
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 0.7074100375175476,
      "learning_rate": 0.00012,
      "loss": 2.691,
      "step": 4
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.8948147296905518,
      "learning_rate": 0.00016,
      "loss": 2.5618,
      "step": 5
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 0.6676309108734131,
      "learning_rate": 0.0002,
      "loss": 1.7508,
      "step": 6
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 0.7051960825920105,
      "learning_rate": 0.0001982608695652174,
      "loss": 2.1303,
      "step": 7
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 0.9678179025650024,
      "learning_rate": 0.0001965217391304348,
      "loss": 1.8536,
      "step": 8
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 0.9203357696533203,
      "learning_rate": 0.00019478260869565218,
      "loss": 2.055,
      "step": 9
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.8674907684326172,
      "learning_rate": 0.00019304347826086958,
      "loss": 2.1066,
      "step": 10
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 1.1215510368347168,
      "learning_rate": 0.00019130434782608697,
      "loss": 2.1572,
      "step": 11
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 0.9455694556236267,
      "learning_rate": 0.00018956521739130436,
      "loss": 1.9253,
      "step": 12
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 1.1137198209762573,
      "learning_rate": 0.00018782608695652175,
      "loss": 2.1465,
      "step": 13
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 1.0382691621780396,
      "learning_rate": 0.00018608695652173914,
      "loss": 1.8506,
      "step": 14
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.827080249786377,
      "learning_rate": 0.00018434782608695653,
      "loss": 1.3841,
      "step": 15
    },
    {
      "epoch": 0.4413793103448276,
      "grad_norm": 0.9159547686576843,
      "learning_rate": 0.00018260869565217392,
      "loss": 1.8481,
      "step": 16
    },
    {
      "epoch": 0.4689655172413793,
      "grad_norm": 0.8442085385322571,
      "learning_rate": 0.00018086956521739132,
      "loss": 1.7424,
      "step": 17
    },
    {
      "epoch": 0.496551724137931,
      "grad_norm": 0.864258348941803,
      "learning_rate": 0.0001791304347826087,
      "loss": 1.6131,
      "step": 18
    },
    {
      "epoch": 0.5241379310344828,
      "grad_norm": 1.0820664167404175,
      "learning_rate": 0.0001773913043478261,
      "loss": 1.6129,
      "step": 19
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.9673957824707031,
      "learning_rate": 0.0001756521739130435,
      "loss": 1.5257,
      "step": 20
    },
    {
      "epoch": 0.5793103448275863,
      "grad_norm": 0.9112041592597961,
      "learning_rate": 0.00017391304347826088,
      "loss": 1.2875,
      "step": 21
    },
    {
      "epoch": 0.6068965517241379,
      "grad_norm": 0.9094520211219788,
      "learning_rate": 0.00017217391304347827,
      "loss": 1.3703,
      "step": 22
    },
    {
      "epoch": 0.6344827586206897,
      "grad_norm": 0.9438947439193726,
      "learning_rate": 0.00017043478260869566,
      "loss": 1.3091,
      "step": 23
    },
    {
      "epoch": 0.6620689655172414,
      "grad_norm": 1.3191273212432861,
      "learning_rate": 0.00016869565217391306,
      "loss": 1.7087,
      "step": 24
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.9469236135482788,
      "learning_rate": 0.00016695652173913042,
      "loss": 1.4919,
      "step": 25
    },
    {
      "epoch": 0.7172413793103448,
      "grad_norm": 1.0983434915542603,
      "learning_rate": 0.00016521739130434784,
      "loss": 1.464,
      "step": 26
    },
    {
      "epoch": 0.7448275862068966,
      "grad_norm": 0.9698247313499451,
      "learning_rate": 0.00016347826086956523,
      "loss": 1.4398,
      "step": 27
    },
    {
      "epoch": 0.7724137931034483,
      "grad_norm": 0.8902468085289001,
      "learning_rate": 0.00016173913043478262,
      "loss": 1.3191,
      "step": 28
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.8863650560379028,
      "learning_rate": 0.00016,
      "loss": 1.2048,
      "step": 29
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 1.0257900953292847,
      "learning_rate": 0.0001582608695652174,
      "loss": 1.2085,
      "step": 30
    },
    {
      "epoch": 0.8551724137931035,
      "grad_norm": 0.9826428294181824,
      "learning_rate": 0.0001565217391304348,
      "loss": 1.1733,
      "step": 31
    },
    {
      "epoch": 0.8827586206896552,
      "grad_norm": 0.9123853445053101,
      "learning_rate": 0.0001547826086956522,
      "loss": 1.4226,
      "step": 32
    },
    {
      "epoch": 0.9103448275862069,
      "grad_norm": 0.8653205633163452,
      "learning_rate": 0.00015304347826086958,
      "loss": 1.3998,
      "step": 33
    },
    {
      "epoch": 0.9379310344827586,
      "grad_norm": 1.2209527492523193,
      "learning_rate": 0.00015130434782608694,
      "loss": 1.2225,
      "step": 34
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.977463960647583,
      "learning_rate": 0.00014956521739130436,
      "loss": 1.186,
      "step": 35
    },
    {
      "epoch": 0.993103448275862,
      "grad_norm": 0.8854506611824036,
      "learning_rate": 0.00014782608695652173,
      "loss": 0.9478,
      "step": 36
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.660280704498291,
      "learning_rate": 0.00014608695652173914,
      "loss": 0.7476,
      "step": 37
    },
    {
      "epoch": 1.0275862068965518,
      "grad_norm": 0.9172261953353882,
      "learning_rate": 0.00014434782608695654,
      "loss": 0.959,
      "step": 38
    },
    {
      "epoch": 1.0551724137931036,
      "grad_norm": 0.9950329661369324,
      "learning_rate": 0.00014260869565217393,
      "loss": 1.087,
      "step": 39
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 0.9052255749702454,
      "learning_rate": 0.00014086956521739132,
      "loss": 1.0335,
      "step": 40
    },
    {
      "epoch": 1.110344827586207,
      "grad_norm": 0.8859487771987915,
      "learning_rate": 0.0001391304347826087,
      "loss": 1.0489,
      "step": 41
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.9165846705436707,
      "learning_rate": 0.0001373913043478261,
      "loss": 1.0135,
      "step": 42
    },
    {
      "epoch": 1.1655172413793102,
      "grad_norm": 1.2192325592041016,
      "learning_rate": 0.00013565217391304347,
      "loss": 1.1084,
      "step": 43
    },
    {
      "epoch": 1.193103448275862,
      "grad_norm": 1.2101364135742188,
      "learning_rate": 0.00013391304347826088,
      "loss": 1.1635,
      "step": 44
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 1.099292516708374,
      "learning_rate": 0.00013217391304347825,
      "loss": 1.1804,
      "step": 45
    },
    {
      "epoch": 1.2482758620689656,
      "grad_norm": 0.9990763068199158,
      "learning_rate": 0.00013043478260869567,
      "loss": 0.8802,
      "step": 46
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.9451124668121338,
      "learning_rate": 0.00012869565217391303,
      "loss": 0.9852,
      "step": 47
    },
    {
      "epoch": 1.303448275862069,
      "grad_norm": 0.96523118019104,
      "learning_rate": 0.00012695652173913045,
      "loss": 1.0243,
      "step": 48
    },
    {
      "epoch": 1.3310344827586207,
      "grad_norm": 1.0256421566009521,
      "learning_rate": 0.00012521739130434784,
      "loss": 0.8908,
      "step": 49
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 1.0647811889648438,
      "learning_rate": 0.00012347826086956523,
      "loss": 0.7224,
      "step": 50
    },
    {
      "epoch": 1.386206896551724,
      "grad_norm": 1.042438268661499,
      "learning_rate": 0.00012173913043478263,
      "loss": 0.5707,
      "step": 51
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 1.0345197916030884,
      "learning_rate": 0.00012,
      "loss": 0.7433,
      "step": 52
    },
    {
      "epoch": 1.4413793103448276,
      "grad_norm": 1.0194092988967896,
      "learning_rate": 0.00011826086956521741,
      "loss": 0.8686,
      "step": 53
    },
    {
      "epoch": 1.4689655172413794,
      "grad_norm": 1.0849523544311523,
      "learning_rate": 0.00011652173913043479,
      "loss": 0.9063,
      "step": 54
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 1.3685775995254517,
      "learning_rate": 0.00011478260869565218,
      "loss": 0.8633,
      "step": 55
    },
    {
      "epoch": 1.524137931034483,
      "grad_norm": 1.2180424928665161,
      "learning_rate": 0.00011304347826086956,
      "loss": 0.8131,
      "step": 56
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 1.027662992477417,
      "learning_rate": 0.00011130434782608696,
      "loss": 0.8072,
      "step": 57
    },
    {
      "epoch": 1.5793103448275863,
      "grad_norm": 1.0541893243789673,
      "learning_rate": 0.00010956521739130434,
      "loss": 0.5513,
      "step": 58
    },
    {
      "epoch": 1.6068965517241378,
      "grad_norm": 0.9840919375419617,
      "learning_rate": 0.00010782608695652174,
      "loss": 0.6515,
      "step": 59
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 1.1880444288253784,
      "learning_rate": 0.00010608695652173915,
      "loss": 0.9452,
      "step": 60
    },
    {
      "epoch": 1.6620689655172414,
      "grad_norm": 1.1577789783477783,
      "learning_rate": 0.00010434782608695653,
      "loss": 0.5324,
      "step": 61
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 1.4066375494003296,
      "learning_rate": 0.00010260869565217393,
      "loss": 0.7674,
      "step": 62
    },
    {
      "epoch": 1.717241379310345,
      "grad_norm": 1.5101147890090942,
      "learning_rate": 0.00010086956521739131,
      "loss": 0.7358,
      "step": 63
    },
    {
      "epoch": 1.7448275862068967,
      "grad_norm": 1.2288732528686523,
      "learning_rate": 9.91304347826087e-05,
      "loss": 0.762,
      "step": 64
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 1.1810815334320068,
      "learning_rate": 9.739130434782609e-05,
      "loss": 0.824,
      "step": 65
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.0823071002960205,
      "learning_rate": 9.565217391304348e-05,
      "loss": 0.683,
      "step": 66
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 1.1553919315338135,
      "learning_rate": 9.391304347826087e-05,
      "loss": 0.737,
      "step": 67
    },
    {
      "epoch": 1.8551724137931034,
      "grad_norm": 1.3099501132965088,
      "learning_rate": 9.217391304347827e-05,
      "loss": 0.4993,
      "step": 68
    },
    {
      "epoch": 1.8827586206896552,
      "grad_norm": 1.3969764709472656,
      "learning_rate": 9.043478260869566e-05,
      "loss": 0.6857,
      "step": 69
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 1.2558094263076782,
      "learning_rate": 8.869565217391305e-05,
      "loss": 0.5347,
      "step": 70
    },
    {
      "epoch": 1.9379310344827587,
      "grad_norm": 1.2341969013214111,
      "learning_rate": 8.695652173913044e-05,
      "loss": 0.6651,
      "step": 71
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 1.2917416095733643,
      "learning_rate": 8.521739130434783e-05,
      "loss": 0.6177,
      "step": 72
    },
    {
      "epoch": 1.993103448275862,
      "grad_norm": 1.2867687940597534,
      "learning_rate": 8.347826086956521e-05,
      "loss": 0.6304,
      "step": 73
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.8276941776275635,
      "learning_rate": 8.173913043478262e-05,
      "loss": 0.829,
      "step": 74
    },
    {
      "epoch": 2.027586206896552,
      "grad_norm": 1.1930606365203857,
      "learning_rate": 8e-05,
      "loss": 0.3571,
      "step": 75
    },
    {
      "epoch": 2.0551724137931036,
      "grad_norm": 1.364702820777893,
      "learning_rate": 7.82608695652174e-05,
      "loss": 0.4993,
      "step": 76
    },
    {
      "epoch": 2.0827586206896553,
      "grad_norm": 1.2684059143066406,
      "learning_rate": 7.652173913043479e-05,
      "loss": 0.4305,
      "step": 77
    },
    {
      "epoch": 2.110344827586207,
      "grad_norm": 1.1678532361984253,
      "learning_rate": 7.478260869565218e-05,
      "loss": 0.4332,
      "step": 78
    },
    {
      "epoch": 2.1379310344827585,
      "grad_norm": 1.3142938613891602,
      "learning_rate": 7.304347826086957e-05,
      "loss": 0.4618,
      "step": 79
    },
    {
      "epoch": 2.1655172413793102,
      "grad_norm": 1.359118938446045,
      "learning_rate": 7.130434782608696e-05,
      "loss": 0.5795,
      "step": 80
    },
    {
      "epoch": 2.193103448275862,
      "grad_norm": 1.6325678825378418,
      "learning_rate": 6.956521739130436e-05,
      "loss": 0.4021,
      "step": 81
    },
    {
      "epoch": 2.220689655172414,
      "grad_norm": 1.2178771495819092,
      "learning_rate": 6.782608695652173e-05,
      "loss": 0.4262,
      "step": 82
    },
    {
      "epoch": 2.2482758620689656,
      "grad_norm": 1.0997027158737183,
      "learning_rate": 6.608695652173912e-05,
      "loss": 0.3029,
      "step": 83
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 1.1487294435501099,
      "learning_rate": 6.434782608695652e-05,
      "loss": 0.3831,
      "step": 84
    },
    {
      "epoch": 2.303448275862069,
      "grad_norm": 1.3247836828231812,
      "learning_rate": 6.260869565217392e-05,
      "loss": 0.4692,
      "step": 85
    },
    {
      "epoch": 2.3310344827586205,
      "grad_norm": 1.1617599725723267,
      "learning_rate": 6.086956521739131e-05,
      "loss": 0.3617,
      "step": 86
    },
    {
      "epoch": 2.3586206896551722,
      "grad_norm": 1.2517313957214355,
      "learning_rate": 5.9130434782608704e-05,
      "loss": 0.2729,
      "step": 87
    },
    {
      "epoch": 2.386206896551724,
      "grad_norm": 1.1272892951965332,
      "learning_rate": 5.739130434782609e-05,
      "loss": 0.3707,
      "step": 88
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 1.196664571762085,
      "learning_rate": 5.565217391304348e-05,
      "loss": 0.4032,
      "step": 89
    },
    {
      "epoch": 2.4413793103448276,
      "grad_norm": 1.4257408380508423,
      "learning_rate": 5.391304347826087e-05,
      "loss": 0.3194,
      "step": 90
    },
    {
      "epoch": 2.4689655172413794,
      "grad_norm": 1.798063039779663,
      "learning_rate": 5.217391304347826e-05,
      "loss": 0.5721,
      "step": 91
    },
    {
      "epoch": 2.496551724137931,
      "grad_norm": 1.449183702468872,
      "learning_rate": 5.0434782608695655e-05,
      "loss": 0.3841,
      "step": 92
    },
    {
      "epoch": 2.524137931034483,
      "grad_norm": 1.4660217761993408,
      "learning_rate": 4.8695652173913046e-05,
      "loss": 0.4134,
      "step": 93
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 1.3259236812591553,
      "learning_rate": 4.695652173913044e-05,
      "loss": 0.5053,
      "step": 94
    },
    {
      "epoch": 2.5793103448275865,
      "grad_norm": 1.1987637281417847,
      "learning_rate": 4.521739130434783e-05,
      "loss": 0.3203,
      "step": 95
    },
    {
      "epoch": 2.606896551724138,
      "grad_norm": 1.702609896659851,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.5448,
      "step": 96
    },
    {
      "epoch": 2.6344827586206896,
      "grad_norm": 1.2012200355529785,
      "learning_rate": 4.1739130434782605e-05,
      "loss": 0.4839,
      "step": 97
    },
    {
      "epoch": 2.6620689655172414,
      "grad_norm": 1.1377613544464111,
      "learning_rate": 4e-05,
      "loss": 0.3131,
      "step": 98
    },
    {
      "epoch": 2.689655172413793,
      "grad_norm": 1.377774953842163,
      "learning_rate": 3.8260869565217395e-05,
      "loss": 0.3463,
      "step": 99
    },
    {
      "epoch": 2.717241379310345,
      "grad_norm": 1.1738471984863281,
      "learning_rate": 3.6521739130434786e-05,
      "loss": 0.2963,
      "step": 100
    },
    {
      "epoch": 2.7448275862068967,
      "grad_norm": 1.1475613117218018,
      "learning_rate": 3.478260869565218e-05,
      "loss": 0.2953,
      "step": 101
    },
    {
      "epoch": 2.772413793103448,
      "grad_norm": 1.5838022232055664,
      "learning_rate": 3.304347826086956e-05,
      "loss": 0.3852,
      "step": 102
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.6446831226348877,
      "learning_rate": 3.130434782608696e-05,
      "loss": 0.5384,
      "step": 103
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 1.4402813911437988,
      "learning_rate": 2.9565217391304352e-05,
      "loss": 0.3184,
      "step": 104
    },
    {
      "epoch": 2.8551724137931034,
      "grad_norm": 1.3366456031799316,
      "learning_rate": 2.782608695652174e-05,
      "loss": 0.4545,
      "step": 105
    },
    {
      "epoch": 2.882758620689655,
      "grad_norm": 1.4988086223602295,
      "learning_rate": 2.608695652173913e-05,
      "loss": 0.2341,
      "step": 106
    },
    {
      "epoch": 2.910344827586207,
      "grad_norm": 1.35313880443573,
      "learning_rate": 2.4347826086956523e-05,
      "loss": 0.3555,
      "step": 107
    },
    {
      "epoch": 2.9379310344827587,
      "grad_norm": 1.1439647674560547,
      "learning_rate": 2.2608695652173914e-05,
      "loss": 0.299,
      "step": 108
    },
    {
      "epoch": 2.9655172413793105,
      "grad_norm": 1.2948118448257446,
      "learning_rate": 2.0869565217391303e-05,
      "loss": 0.4421,
      "step": 109
    },
    {
      "epoch": 2.9931034482758623,
      "grad_norm": 1.283116340637207,
      "learning_rate": 1.9130434782608697e-05,
      "loss": 0.368,
      "step": 110
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.516788959503174,
      "learning_rate": 1.739130434782609e-05,
      "loss": 0.3088,
      "step": 111
    },
    {
      "epoch": 3.027586206896552,
      "grad_norm": 1.1082452535629272,
      "learning_rate": 1.565217391304348e-05,
      "loss": 0.2828,
      "step": 112
    },
    {
      "epoch": 3.0551724137931036,
      "grad_norm": 1.109584927558899,
      "learning_rate": 1.391304347826087e-05,
      "loss": 0.2485,
      "step": 113
    },
    {
      "epoch": 3.0827586206896553,
      "grad_norm": 1.0583851337432861,
      "learning_rate": 1.2173913043478261e-05,
      "loss": 0.2712,
      "step": 114
    },
    {
      "epoch": 3.110344827586207,
      "grad_norm": 1.1029939651489258,
      "learning_rate": 1.0434782608695651e-05,
      "loss": 0.3537,
      "step": 115
    },
    {
      "epoch": 3.1379310344827585,
      "grad_norm": 1.0736896991729736,
      "learning_rate": 8.695652173913044e-06,
      "loss": 0.2121,
      "step": 116
    },
    {
      "epoch": 3.1655172413793102,
      "grad_norm": 1.1432276964187622,
      "learning_rate": 6.956521739130435e-06,
      "loss": 0.2371,
      "step": 117
    },
    {
      "epoch": 3.193103448275862,
      "grad_norm": 1.4690179824829102,
      "learning_rate": 5.217391304347826e-06,
      "loss": 0.3357,
      "step": 118
    },
    {
      "epoch": 3.220689655172414,
      "grad_norm": 1.240258812904358,
      "learning_rate": 3.4782608695652175e-06,
      "loss": 0.2727,
      "step": 119
    },
    {
      "epoch": 3.2482758620689656,
      "grad_norm": 1.2821192741394043,
      "learning_rate": 1.7391304347826088e-06,
      "loss": 0.3032,
      "step": 120
    }
  ],
  "logging_steps": 1,
  "max_steps": 120,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.068699077260083e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}