{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.606060606060606, "eval_steps": 500, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04040404040404041, "grad_norm": 0.7893974184989929, "learning_rate": 0.0, "loss": 0.1676, "step": 1 }, { "epoch": 0.08080808080808081, "grad_norm": 1.0626084804534912, "learning_rate": 4e-05, "loss": 0.2393, "step": 2 }, { "epoch": 0.12121212121212122, "grad_norm": 1.0501738786697388, "learning_rate": 8e-05, "loss": 0.2211, "step": 3 }, { "epoch": 0.16161616161616163, "grad_norm": 0.21509379148483276, "learning_rate": 0.00012, "loss": 0.066, "step": 4 }, { "epoch": 0.20202020202020202, "grad_norm": 0.17153297364711761, "learning_rate": 0.00016, "loss": 0.0617, "step": 5 }, { "epoch": 0.24242424242424243, "grad_norm": 0.7136198878288269, "learning_rate": 0.0002, "loss": 0.1641, "step": 6 }, { "epoch": 0.2828282828282828, "grad_norm": 0.3403281569480896, "learning_rate": 0.00019764705882352942, "loss": 0.0778, "step": 7 }, { "epoch": 0.32323232323232326, "grad_norm": 0.589501142501831, "learning_rate": 0.00019529411764705883, "loss": 0.1492, "step": 8 }, { "epoch": 0.36363636363636365, "grad_norm": 0.42319923639297485, "learning_rate": 0.00019294117647058825, "loss": 0.1345, "step": 9 }, { "epoch": 0.40404040404040403, "grad_norm": 0.40605250000953674, "learning_rate": 0.00019058823529411766, "loss": 0.1056, "step": 10 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5076654553413391, "learning_rate": 0.00018823529411764707, "loss": 0.1595, "step": 11 }, { "epoch": 0.48484848484848486, "grad_norm": 0.37221914529800415, "learning_rate": 0.00018588235294117648, "loss": 0.1089, "step": 12 }, { "epoch": 0.5252525252525253, "grad_norm": 0.49597060680389404, "learning_rate": 0.0001835294117647059, "loss": 0.0955, "step": 13 }, { "epoch": 0.5656565656565656, "grad_norm": 0.3603276312351227, "learning_rate": 0.0001811764705882353, "loss": 0.0874, "step": 14 }, { "epoch": 0.6060606060606061, "grad_norm": 0.3946147561073303, "learning_rate": 0.00017882352941176472, "loss": 0.0834, "step": 15 }, { "epoch": 0.6464646464646465, "grad_norm": 0.31046634912490845, "learning_rate": 0.00017647058823529413, "loss": 0.0845, "step": 16 }, { "epoch": 0.6868686868686869, "grad_norm": 0.31403785943984985, "learning_rate": 0.00017411764705882354, "loss": 0.0942, "step": 17 }, { "epoch": 0.7272727272727273, "grad_norm": 0.32949572801589966, "learning_rate": 0.00017176470588235293, "loss": 0.0976, "step": 18 }, { "epoch": 0.7676767676767676, "grad_norm": 0.8442594408988953, "learning_rate": 0.00016941176470588237, "loss": 0.1502, "step": 19 }, { "epoch": 0.8080808080808081, "grad_norm": 0.8818703889846802, "learning_rate": 0.00016705882352941178, "loss": 0.1008, "step": 20 }, { "epoch": 0.8484848484848485, "grad_norm": 1.1039226055145264, "learning_rate": 0.0001647058823529412, "loss": 0.0908, "step": 21 }, { "epoch": 0.8888888888888888, "grad_norm": 0.616620659828186, "learning_rate": 0.0001623529411764706, "loss": 0.1022, "step": 22 }, { "epoch": 0.9292929292929293, "grad_norm": 0.43923619389533997, "learning_rate": 0.00016, "loss": 0.1095, "step": 23 }, { "epoch": 0.9696969696969697, "grad_norm": 0.668854296207428, "learning_rate": 0.00015764705882352943, "loss": 0.0994, "step": 24 }, { "epoch": 1.0, "grad_norm": 0.54339200258255, "learning_rate": 0.00015529411764705884, "loss": 0.0822, "step": 25 }, { "epoch": 1.0404040404040404, "grad_norm": 0.52640700340271, "learning_rate": 0.00015294117647058822, "loss": 0.091, "step": 26 }, { "epoch": 1.0808080808080809, "grad_norm": 0.243753120303154, "learning_rate": 0.00015058823529411766, "loss": 0.0753, "step": 27 }, { "epoch": 1.121212121212121, "grad_norm": 0.16135047376155853, "learning_rate": 0.00014823529411764707, "loss": 0.0818, "step": 28 }, { "epoch": 1.1616161616161615, "grad_norm": 0.9692177772521973, "learning_rate": 0.00014588235294117646, "loss": 0.1394, "step": 29 }, { "epoch": 1.202020202020202, "grad_norm": 0.48012155294418335, "learning_rate": 0.0001435294117647059, "loss": 0.095, "step": 30 }, { "epoch": 1.2424242424242424, "grad_norm": 0.3694566786289215, "learning_rate": 0.0001411764705882353, "loss": 0.0776, "step": 31 }, { "epoch": 1.2828282828282829, "grad_norm": 0.604898989200592, "learning_rate": 0.00013882352941176472, "loss": 0.0727, "step": 32 }, { "epoch": 1.3232323232323233, "grad_norm": 0.6668853163719177, "learning_rate": 0.00013647058823529413, "loss": 0.1211, "step": 33 }, { "epoch": 1.3636363636363638, "grad_norm": 0.8030984401702881, "learning_rate": 0.00013411764705882352, "loss": 0.0724, "step": 34 }, { "epoch": 1.404040404040404, "grad_norm": 0.5926573872566223, "learning_rate": 0.00013176470588235296, "loss": 0.0671, "step": 35 }, { "epoch": 1.4444444444444444, "grad_norm": 0.20058207213878632, "learning_rate": 0.00012941176470588237, "loss": 0.0686, "step": 36 }, { "epoch": 1.4848484848484849, "grad_norm": 0.30539166927337646, "learning_rate": 0.00012705882352941175, "loss": 0.0968, "step": 37 }, { "epoch": 1.5252525252525253, "grad_norm": 0.6506590247154236, "learning_rate": 0.0001247058823529412, "loss": 0.0972, "step": 38 }, { "epoch": 1.5656565656565657, "grad_norm": 0.647463858127594, "learning_rate": 0.0001223529411764706, "loss": 0.0786, "step": 39 }, { "epoch": 1.606060606060606, "grad_norm": 0.4133020043373108, "learning_rate": 0.00012, "loss": 0.0985, "step": 40 }, { "epoch": 1.6464646464646466, "grad_norm": 0.798978328704834, "learning_rate": 0.00011764705882352942, "loss": 0.0993, "step": 41 }, { "epoch": 1.6868686868686869, "grad_norm": 0.438997358083725, "learning_rate": 0.00011529411764705881, "loss": 0.1002, "step": 42 }, { "epoch": 1.7272727272727273, "grad_norm": 0.2584928870201111, "learning_rate": 0.00011294117647058824, "loss": 0.0851, "step": 43 }, { "epoch": 1.7676767676767677, "grad_norm": 0.259726345539093, "learning_rate": 0.00011058823529411766, "loss": 0.0859, "step": 44 }, { "epoch": 1.808080808080808, "grad_norm": 0.44141435623168945, "learning_rate": 0.00010823529411764706, "loss": 0.1094, "step": 45 }, { "epoch": 1.8484848484848486, "grad_norm": 0.5731039047241211, "learning_rate": 0.00010588235294117647, "loss": 0.1231, "step": 46 }, { "epoch": 1.8888888888888888, "grad_norm": 0.3471589684486389, "learning_rate": 0.0001035294117647059, "loss": 0.0773, "step": 47 }, { "epoch": 1.9292929292929293, "grad_norm": 0.2618795335292816, "learning_rate": 0.0001011764705882353, "loss": 0.0832, "step": 48 }, { "epoch": 1.9696969696969697, "grad_norm": 0.4264814257621765, "learning_rate": 9.882352941176471e-05, "loss": 0.0925, "step": 49 }, { "epoch": 2.0, "grad_norm": 0.5760068297386169, "learning_rate": 9.647058823529412e-05, "loss": 0.0778, "step": 50 }, { "epoch": 2.04040404040404, "grad_norm": 0.22954879701137543, "learning_rate": 9.411764705882353e-05, "loss": 0.0733, "step": 51 }, { "epoch": 2.080808080808081, "grad_norm": 0.21470747888088226, "learning_rate": 9.176470588235295e-05, "loss": 0.0716, "step": 52 }, { "epoch": 2.121212121212121, "grad_norm": 0.2303597778081894, "learning_rate": 8.941176470588236e-05, "loss": 0.0738, "step": 53 }, { "epoch": 2.1616161616161618, "grad_norm": 0.2480212152004242, "learning_rate": 8.705882352941177e-05, "loss": 0.0791, "step": 54 }, { "epoch": 2.202020202020202, "grad_norm": 0.1986403614282608, "learning_rate": 8.470588235294118e-05, "loss": 0.0673, "step": 55 }, { "epoch": 2.242424242424242, "grad_norm": 0.2764434218406677, "learning_rate": 8.23529411764706e-05, "loss": 0.0764, "step": 56 }, { "epoch": 2.282828282828283, "grad_norm": 0.45056474208831787, "learning_rate": 8e-05, "loss": 0.0683, "step": 57 }, { "epoch": 2.323232323232323, "grad_norm": 0.37713348865509033, "learning_rate": 7.764705882352942e-05, "loss": 0.0785, "step": 58 }, { "epoch": 2.3636363636363638, "grad_norm": 0.19750048220157623, "learning_rate": 7.529411764705883e-05, "loss": 0.0719, "step": 59 }, { "epoch": 2.404040404040404, "grad_norm": 0.23382727801799774, "learning_rate": 7.294117647058823e-05, "loss": 0.0769, "step": 60 }, { "epoch": 2.4444444444444446, "grad_norm": 0.43519431352615356, "learning_rate": 7.058823529411765e-05, "loss": 0.0953, "step": 61 }, { "epoch": 2.484848484848485, "grad_norm": 0.8023049831390381, "learning_rate": 6.823529411764707e-05, "loss": 0.0978, "step": 62 }, { "epoch": 2.525252525252525, "grad_norm": 0.5448880195617676, "learning_rate": 6.588235294117648e-05, "loss": 0.0786, "step": 63 }, { "epoch": 2.5656565656565657, "grad_norm": 0.5319021940231323, "learning_rate": 6.352941176470588e-05, "loss": 0.0837, "step": 64 }, { "epoch": 2.606060606060606, "grad_norm": 0.3056259751319885, "learning_rate": 6.11764705882353e-05, "loss": 0.0716, "step": 65 }, { "epoch": 2.6464646464646466, "grad_norm": 0.3007633686065674, "learning_rate": 5.882352941176471e-05, "loss": 0.0781, "step": 66 }, { "epoch": 2.686868686868687, "grad_norm": 0.517301619052887, "learning_rate": 5.647058823529412e-05, "loss": 0.0751, "step": 67 }, { "epoch": 2.7272727272727275, "grad_norm": 0.31967368721961975, "learning_rate": 5.411764705882353e-05, "loss": 0.0948, "step": 68 }, { "epoch": 2.7676767676767677, "grad_norm": 0.22360506653785706, "learning_rate": 5.176470588235295e-05, "loss": 0.0721, "step": 69 }, { "epoch": 2.808080808080808, "grad_norm": 0.8932453393936157, "learning_rate": 4.9411764705882355e-05, "loss": 0.083, "step": 70 }, { "epoch": 2.8484848484848486, "grad_norm": 0.17888718843460083, "learning_rate": 4.705882352941177e-05, "loss": 0.0759, "step": 71 }, { "epoch": 2.888888888888889, "grad_norm": 0.2312222719192505, "learning_rate": 4.470588235294118e-05, "loss": 0.0819, "step": 72 }, { "epoch": 2.929292929292929, "grad_norm": 0.3377898335456848, "learning_rate": 4.235294117647059e-05, "loss": 0.091, "step": 73 }, { "epoch": 2.9696969696969697, "grad_norm": 0.22434180974960327, "learning_rate": 4e-05, "loss": 0.0656, "step": 74 }, { "epoch": 3.0, "grad_norm": 0.4803672432899475, "learning_rate": 3.7647058823529415e-05, "loss": 0.0654, "step": 75 }, { "epoch": 3.04040404040404, "grad_norm": 0.18344801664352417, "learning_rate": 3.529411764705883e-05, "loss": 0.0681, "step": 76 }, { "epoch": 3.080808080808081, "grad_norm": 0.18728883564472198, "learning_rate": 3.294117647058824e-05, "loss": 0.0641, "step": 77 }, { "epoch": 3.121212121212121, "grad_norm": 0.509119987487793, "learning_rate": 3.058823529411765e-05, "loss": 0.0777, "step": 78 }, { "epoch": 3.1616161616161618, "grad_norm": 0.16499896347522736, "learning_rate": 2.823529411764706e-05, "loss": 0.0578, "step": 79 }, { "epoch": 3.202020202020202, "grad_norm": 0.17131227254867554, "learning_rate": 2.5882352941176475e-05, "loss": 0.0597, "step": 80 }, { "epoch": 3.242424242424242, "grad_norm": 0.17663079500198364, "learning_rate": 2.3529411764705884e-05, "loss": 0.0674, "step": 81 }, { "epoch": 3.282828282828283, "grad_norm": 0.18466462194919586, "learning_rate": 2.1176470588235296e-05, "loss": 0.062, "step": 82 }, { "epoch": 3.323232323232323, "grad_norm": 0.1754070371389389, "learning_rate": 1.8823529411764708e-05, "loss": 0.0703, "step": 83 }, { "epoch": 3.3636363636363638, "grad_norm": 0.16022254526615143, "learning_rate": 1.647058823529412e-05, "loss": 0.0651, "step": 84 }, { "epoch": 3.404040404040404, "grad_norm": 0.16330307722091675, "learning_rate": 1.411764705882353e-05, "loss": 0.0566, "step": 85 }, { "epoch": 3.4444444444444446, "grad_norm": 0.38002651929855347, "learning_rate": 1.1764705882352942e-05, "loss": 0.0726, "step": 86 }, { "epoch": 3.484848484848485, "grad_norm": 0.17870256304740906, "learning_rate": 9.411764705882354e-06, "loss": 0.0658, "step": 87 }, { "epoch": 3.525252525252525, "grad_norm": 0.19073323905467987, "learning_rate": 7.058823529411765e-06, "loss": 0.0645, "step": 88 }, { "epoch": 3.5656565656565657, "grad_norm": 0.1769099086523056, "learning_rate": 4.705882352941177e-06, "loss": 0.0615, "step": 89 }, { "epoch": 3.606060606060606, "grad_norm": 0.2047484815120697, "learning_rate": 2.3529411764705885e-06, "loss": 0.0713, "step": 90 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3309241116770304.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }