{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.606060606060606,
  "eval_steps": 500,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04040404040404041,
      "grad_norm": 0.7893974184989929,
      "learning_rate": 0.0,
      "loss": 0.1676,
      "step": 1
    },
    {
      "epoch": 0.08080808080808081,
      "grad_norm": 1.0626084804534912,
      "learning_rate": 4e-05,
      "loss": 0.2393,
      "step": 2
    },
    {
      "epoch": 0.12121212121212122,
      "grad_norm": 1.0501738786697388,
      "learning_rate": 8e-05,
      "loss": 0.2211,
      "step": 3
    },
    {
      "epoch": 0.16161616161616163,
      "grad_norm": 0.21509379148483276,
      "learning_rate": 0.00012,
      "loss": 0.066,
      "step": 4
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 0.17153297364711761,
      "learning_rate": 0.00016,
      "loss": 0.0617,
      "step": 5
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 0.7136198878288269,
      "learning_rate": 0.0002,
      "loss": 0.1641,
      "step": 6
    },
    {
      "epoch": 0.2828282828282828,
      "grad_norm": 0.3403281569480896,
      "learning_rate": 0.00019764705882352942,
      "loss": 0.0778,
      "step": 7
    },
    {
      "epoch": 0.32323232323232326,
      "grad_norm": 0.589501142501831,
      "learning_rate": 0.00019529411764705883,
      "loss": 0.1492,
      "step": 8
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.42319923639297485,
      "learning_rate": 0.00019294117647058825,
      "loss": 0.1345,
      "step": 9
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 0.40605250000953674,
      "learning_rate": 0.00019058823529411766,
      "loss": 0.1056,
      "step": 10
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.5076654553413391,
      "learning_rate": 0.00018823529411764707,
      "loss": 0.1595,
      "step": 11
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 0.37221914529800415,
      "learning_rate": 0.00018588235294117648,
      "loss": 0.1089,
      "step": 12
    },
    {
      "epoch": 0.5252525252525253,
      "grad_norm": 0.49597060680389404,
      "learning_rate": 0.0001835294117647059,
      "loss": 0.0955,
      "step": 13
    },
    {
      "epoch": 0.5656565656565656,
      "grad_norm": 0.3603276312351227,
      "learning_rate": 0.0001811764705882353,
      "loss": 0.0874,
      "step": 14
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.3946147561073303,
      "learning_rate": 0.00017882352941176472,
      "loss": 0.0834,
      "step": 15
    },
    {
      "epoch": 0.6464646464646465,
      "grad_norm": 0.31046634912490845,
      "learning_rate": 0.00017647058823529413,
      "loss": 0.0845,
      "step": 16
    },
    {
      "epoch": 0.6868686868686869,
      "grad_norm": 0.31403785943984985,
      "learning_rate": 0.00017411764705882354,
      "loss": 0.0942,
      "step": 17
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.32949572801589966,
      "learning_rate": 0.00017176470588235293,
      "loss": 0.0976,
      "step": 18
    },
    {
      "epoch": 0.7676767676767676,
      "grad_norm": 0.8442594408988953,
      "learning_rate": 0.00016941176470588237,
      "loss": 0.1502,
      "step": 19
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.8818703889846802,
      "learning_rate": 0.00016705882352941178,
      "loss": 0.1008,
      "step": 20
    },
    {
      "epoch": 0.8484848484848485,
      "grad_norm": 1.1039226055145264,
      "learning_rate": 0.0001647058823529412,
      "loss": 0.0908,
      "step": 21
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.616620659828186,
      "learning_rate": 0.0001623529411764706,
      "loss": 0.1022,
      "step": 22
    },
    {
      "epoch": 0.9292929292929293,
      "grad_norm": 0.43923619389533997,
      "learning_rate": 0.00016,
      "loss": 0.1095,
      "step": 23
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 0.668854296207428,
      "learning_rate": 0.00015764705882352943,
      "loss": 0.0994,
      "step": 24
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.54339200258255,
      "learning_rate": 0.00015529411764705884,
      "loss": 0.0822,
      "step": 25
    },
    {
      "epoch": 1.0404040404040404,
      "grad_norm": 0.52640700340271,
      "learning_rate": 0.00015294117647058822,
      "loss": 0.091,
      "step": 26
    },
    {
      "epoch": 1.0808080808080809,
      "grad_norm": 0.243753120303154,
      "learning_rate": 0.00015058823529411766,
      "loss": 0.0753,
      "step": 27
    },
    {
      "epoch": 1.121212121212121,
      "grad_norm": 0.16135047376155853,
      "learning_rate": 0.00014823529411764707,
      "loss": 0.0818,
      "step": 28
    },
    {
      "epoch": 1.1616161616161615,
      "grad_norm": 0.9692177772521973,
      "learning_rate": 0.00014588235294117646,
      "loss": 0.1394,
      "step": 29
    },
    {
      "epoch": 1.202020202020202,
      "grad_norm": 0.48012155294418335,
      "learning_rate": 0.0001435294117647059,
      "loss": 0.095,
      "step": 30
    },
    {
      "epoch": 1.2424242424242424,
      "grad_norm": 0.3694566786289215,
      "learning_rate": 0.0001411764705882353,
      "loss": 0.0776,
      "step": 31
    },
    {
      "epoch": 1.2828282828282829,
      "grad_norm": 0.604898989200592,
      "learning_rate": 0.00013882352941176472,
      "loss": 0.0727,
      "step": 32
    },
    {
      "epoch": 1.3232323232323233,
      "grad_norm": 0.6668853163719177,
      "learning_rate": 0.00013647058823529413,
      "loss": 0.1211,
      "step": 33
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.8030984401702881,
      "learning_rate": 0.00013411764705882352,
      "loss": 0.0724,
      "step": 34
    },
    {
      "epoch": 1.404040404040404,
      "grad_norm": 0.5926573872566223,
      "learning_rate": 0.00013176470588235296,
      "loss": 0.0671,
      "step": 35
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.20058207213878632,
      "learning_rate": 0.00012941176470588237,
      "loss": 0.0686,
      "step": 36
    },
    {
      "epoch": 1.4848484848484849,
      "grad_norm": 0.30539166927337646,
      "learning_rate": 0.00012705882352941175,
      "loss": 0.0968,
      "step": 37
    },
    {
      "epoch": 1.5252525252525253,
      "grad_norm": 0.6506590247154236,
      "learning_rate": 0.0001247058823529412,
      "loss": 0.0972,
      "step": 38
    },
    {
      "epoch": 1.5656565656565657,
      "grad_norm": 0.647463858127594,
      "learning_rate": 0.0001223529411764706,
      "loss": 0.0786,
      "step": 39
    },
    {
      "epoch": 1.606060606060606,
      "grad_norm": 0.4133020043373108,
      "learning_rate": 0.00012,
      "loss": 0.0985,
      "step": 40
    },
    {
      "epoch": 1.6464646464646466,
      "grad_norm": 0.798978328704834,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.0993,
      "step": 41
    },
    {
      "epoch": 1.6868686868686869,
      "grad_norm": 0.438997358083725,
      "learning_rate": 0.00011529411764705881,
      "loss": 0.1002,
      "step": 42
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 0.2584928870201111,
      "learning_rate": 0.00011294117647058824,
      "loss": 0.0851,
      "step": 43
    },
    {
      "epoch": 1.7676767676767677,
      "grad_norm": 0.259726345539093,
      "learning_rate": 0.00011058823529411766,
      "loss": 0.0859,
      "step": 44
    },
    {
      "epoch": 1.808080808080808,
      "grad_norm": 0.44141435623168945,
      "learning_rate": 0.00010823529411764706,
      "loss": 0.1094,
      "step": 45
    },
    {
      "epoch": 1.8484848484848486,
      "grad_norm": 0.5731039047241211,
      "learning_rate": 0.00010588235294117647,
      "loss": 0.1231,
      "step": 46
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.3471589684486389,
      "learning_rate": 0.0001035294117647059,
      "loss": 0.0773,
      "step": 47
    },
    {
      "epoch": 1.9292929292929293,
      "grad_norm": 0.2618795335292816,
      "learning_rate": 0.0001011764705882353,
      "loss": 0.0832,
      "step": 48
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 0.4264814257621765,
      "learning_rate": 9.882352941176471e-05,
      "loss": 0.0925,
      "step": 49
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.5760068297386169,
      "learning_rate": 9.647058823529412e-05,
      "loss": 0.0778,
      "step": 50
    },
    {
      "epoch": 2.04040404040404,
      "grad_norm": 0.22954879701137543,
      "learning_rate": 9.411764705882353e-05,
      "loss": 0.0733,
      "step": 51
    },
    {
      "epoch": 2.080808080808081,
      "grad_norm": 0.21470747888088226,
      "learning_rate": 9.176470588235295e-05,
      "loss": 0.0716,
      "step": 52
    },
    {
      "epoch": 2.121212121212121,
      "grad_norm": 0.2303597778081894,
      "learning_rate": 8.941176470588236e-05,
      "loss": 0.0738,
      "step": 53
    },
    {
      "epoch": 2.1616161616161618,
      "grad_norm": 0.2480212152004242,
      "learning_rate": 8.705882352941177e-05,
      "loss": 0.0791,
      "step": 54
    },
    {
      "epoch": 2.202020202020202,
      "grad_norm": 0.1986403614282608,
      "learning_rate": 8.470588235294118e-05,
      "loss": 0.0673,
      "step": 55
    },
    {
      "epoch": 2.242424242424242,
      "grad_norm": 0.2764434218406677,
      "learning_rate": 8.23529411764706e-05,
      "loss": 0.0764,
      "step": 56
    },
    {
      "epoch": 2.282828282828283,
      "grad_norm": 0.45056474208831787,
      "learning_rate": 8e-05,
      "loss": 0.0683,
      "step": 57
    },
    {
      "epoch": 2.323232323232323,
      "grad_norm": 0.37713348865509033,
      "learning_rate": 7.764705882352942e-05,
      "loss": 0.0785,
      "step": 58
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 0.19750048220157623,
      "learning_rate": 7.529411764705883e-05,
      "loss": 0.0719,
      "step": 59
    },
    {
      "epoch": 2.404040404040404,
      "grad_norm": 0.23382727801799774,
      "learning_rate": 7.294117647058823e-05,
      "loss": 0.0769,
      "step": 60
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.43519431352615356,
      "learning_rate": 7.058823529411765e-05,
      "loss": 0.0953,
      "step": 61
    },
    {
      "epoch": 2.484848484848485,
      "grad_norm": 0.8023049831390381,
      "learning_rate": 6.823529411764707e-05,
      "loss": 0.0978,
      "step": 62
    },
    {
      "epoch": 2.525252525252525,
      "grad_norm": 0.5448880195617676,
      "learning_rate": 6.588235294117648e-05,
      "loss": 0.0786,
      "step": 63
    },
    {
      "epoch": 2.5656565656565657,
      "grad_norm": 0.5319021940231323,
      "learning_rate": 6.352941176470588e-05,
      "loss": 0.0837,
      "step": 64
    },
    {
      "epoch": 2.606060606060606,
      "grad_norm": 0.3056259751319885,
      "learning_rate": 6.11764705882353e-05,
      "loss": 0.0716,
      "step": 65
    },
    {
      "epoch": 2.6464646464646466,
      "grad_norm": 0.3007633686065674,
      "learning_rate": 5.882352941176471e-05,
      "loss": 0.0781,
      "step": 66
    },
    {
      "epoch": 2.686868686868687,
      "grad_norm": 0.517301619052887,
      "learning_rate": 5.647058823529412e-05,
      "loss": 0.0751,
      "step": 67
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.31967368721961975,
      "learning_rate": 5.411764705882353e-05,
      "loss": 0.0948,
      "step": 68
    },
    {
      "epoch": 2.7676767676767677,
      "grad_norm": 0.22360506653785706,
      "learning_rate": 5.176470588235295e-05,
      "loss": 0.0721,
      "step": 69
    },
    {
      "epoch": 2.808080808080808,
      "grad_norm": 0.8932453393936157,
      "learning_rate": 4.9411764705882355e-05,
      "loss": 0.083,
      "step": 70
    },
    {
      "epoch": 2.8484848484848486,
      "grad_norm": 0.17888718843460083,
      "learning_rate": 4.705882352941177e-05,
      "loss": 0.0759,
      "step": 71
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.2312222719192505,
      "learning_rate": 4.470588235294118e-05,
      "loss": 0.0819,
      "step": 72
    },
    {
      "epoch": 2.929292929292929,
      "grad_norm": 0.3377898335456848,
      "learning_rate": 4.235294117647059e-05,
      "loss": 0.091,
      "step": 73
    },
    {
      "epoch": 2.9696969696969697,
      "grad_norm": 0.22434180974960327,
      "learning_rate": 4e-05,
      "loss": 0.0656,
      "step": 74
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.4803672432899475,
      "learning_rate": 3.7647058823529415e-05,
      "loss": 0.0654,
      "step": 75
    },
    {
      "epoch": 3.04040404040404,
      "grad_norm": 0.18344801664352417,
      "learning_rate": 3.529411764705883e-05,
      "loss": 0.0681,
      "step": 76
    },
    {
      "epoch": 3.080808080808081,
      "grad_norm": 0.18728883564472198,
      "learning_rate": 3.294117647058824e-05,
      "loss": 0.0641,
      "step": 77
    },
    {
      "epoch": 3.121212121212121,
      "grad_norm": 0.509119987487793,
      "learning_rate": 3.058823529411765e-05,
      "loss": 0.0777,
      "step": 78
    },
    {
      "epoch": 3.1616161616161618,
      "grad_norm": 0.16499896347522736,
      "learning_rate": 2.823529411764706e-05,
      "loss": 0.0578,
      "step": 79
    },
    {
      "epoch": 3.202020202020202,
      "grad_norm": 0.17131227254867554,
      "learning_rate": 2.5882352941176475e-05,
      "loss": 0.0597,
      "step": 80
    },
    {
      "epoch": 3.242424242424242,
      "grad_norm": 0.17663079500198364,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 0.0674,
      "step": 81
    },
    {
      "epoch": 3.282828282828283,
      "grad_norm": 0.18466462194919586,
      "learning_rate": 2.1176470588235296e-05,
      "loss": 0.062,
      "step": 82
    },
    {
      "epoch": 3.323232323232323,
      "grad_norm": 0.1754070371389389,
      "learning_rate": 1.8823529411764708e-05,
      "loss": 0.0703,
      "step": 83
    },
    {
      "epoch": 3.3636363636363638,
      "grad_norm": 0.16022254526615143,
      "learning_rate": 1.647058823529412e-05,
      "loss": 0.0651,
      "step": 84
    },
    {
      "epoch": 3.404040404040404,
      "grad_norm": 0.16330307722091675,
      "learning_rate": 1.411764705882353e-05,
      "loss": 0.0566,
      "step": 85
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.38002651929855347,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 0.0726,
      "step": 86
    },
    {
      "epoch": 3.484848484848485,
      "grad_norm": 0.17870256304740906,
      "learning_rate": 9.411764705882354e-06,
      "loss": 0.0658,
      "step": 87
    },
    {
      "epoch": 3.525252525252525,
      "grad_norm": 0.19073323905467987,
      "learning_rate": 7.058823529411765e-06,
      "loss": 0.0645,
      "step": 88
    },
    {
      "epoch": 3.5656565656565657,
      "grad_norm": 0.1769099086523056,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.0615,
      "step": 89
    },
    {
      "epoch": 3.606060606060606,
      "grad_norm": 0.2047484815120697,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.0713,
      "step": 90
    }
  ],
  "logging_steps": 1,
  "max_steps": 90,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3309241116770304.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}