{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2482758620689656, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027586206896551724, "grad_norm": 0.8273730278015137, "learning_rate": 0.0, "loss": 2.5179, "step": 1 }, { "epoch": 0.05517241379310345, "grad_norm": 0.8761172890663147, "learning_rate": 4e-05, "loss": 2.8481, "step": 2 }, { "epoch": 0.08275862068965517, "grad_norm": 0.7226160764694214, "learning_rate": 8e-05, "loss": 2.6317, "step": 3 }, { "epoch": 0.1103448275862069, "grad_norm": 0.7074100375175476, "learning_rate": 0.00012, "loss": 2.691, "step": 4 }, { "epoch": 0.13793103448275862, "grad_norm": 0.8948147296905518, "learning_rate": 0.00016, "loss": 2.5618, "step": 5 }, { "epoch": 0.16551724137931034, "grad_norm": 0.6676309108734131, "learning_rate": 0.0002, "loss": 1.7508, "step": 6 }, { "epoch": 0.19310344827586207, "grad_norm": 0.7051960825920105, "learning_rate": 0.0001982608695652174, "loss": 2.1303, "step": 7 }, { "epoch": 0.2206896551724138, "grad_norm": 0.9678179025650024, "learning_rate": 0.0001965217391304348, "loss": 1.8536, "step": 8 }, { "epoch": 0.2482758620689655, "grad_norm": 0.9203357696533203, "learning_rate": 0.00019478260869565218, "loss": 2.055, "step": 9 }, { "epoch": 0.27586206896551724, "grad_norm": 0.8674907684326172, "learning_rate": 0.00019304347826086958, "loss": 2.1066, "step": 10 }, { "epoch": 0.30344827586206896, "grad_norm": 1.1215510368347168, "learning_rate": 0.00019130434782608697, "loss": 2.1572, "step": 11 }, { "epoch": 0.3310344827586207, "grad_norm": 0.9455694556236267, "learning_rate": 0.00018956521739130436, "loss": 1.9253, "step": 12 }, { "epoch": 0.3586206896551724, "grad_norm": 1.1137198209762573, "learning_rate": 0.00018782608695652175, "loss": 2.1465, "step": 13 }, { "epoch": 0.38620689655172413, "grad_norm": 1.0382691621780396, "learning_rate": 0.00018608695652173914, "loss": 1.8506, "step": 14 }, { "epoch": 0.41379310344827586, "grad_norm": 0.827080249786377, "learning_rate": 0.00018434782608695653, "loss": 1.3841, "step": 15 }, { "epoch": 0.4413793103448276, "grad_norm": 0.9159547686576843, "learning_rate": 0.00018260869565217392, "loss": 1.8481, "step": 16 }, { "epoch": 0.4689655172413793, "grad_norm": 0.8442085385322571, "learning_rate": 0.00018086956521739132, "loss": 1.7424, "step": 17 }, { "epoch": 0.496551724137931, "grad_norm": 0.864258348941803, "learning_rate": 0.0001791304347826087, "loss": 1.6131, "step": 18 }, { "epoch": 0.5241379310344828, "grad_norm": 1.0820664167404175, "learning_rate": 0.0001773913043478261, "loss": 1.6129, "step": 19 }, { "epoch": 0.5517241379310345, "grad_norm": 0.9673957824707031, "learning_rate": 0.0001756521739130435, "loss": 1.5257, "step": 20 }, { "epoch": 0.5793103448275863, "grad_norm": 0.9112041592597961, "learning_rate": 0.00017391304347826088, "loss": 1.2875, "step": 21 }, { "epoch": 0.6068965517241379, "grad_norm": 0.9094520211219788, "learning_rate": 0.00017217391304347827, "loss": 1.3703, "step": 22 }, { "epoch": 0.6344827586206897, "grad_norm": 0.9438947439193726, "learning_rate": 0.00017043478260869566, "loss": 1.3091, "step": 23 }, { "epoch": 0.6620689655172414, "grad_norm": 1.3191273212432861, "learning_rate": 0.00016869565217391306, "loss": 1.7087, "step": 24 }, { "epoch": 0.6896551724137931, "grad_norm": 0.9469236135482788, "learning_rate": 0.00016695652173913042, "loss": 1.4919, "step": 25 }, { "epoch": 0.7172413793103448, "grad_norm": 1.0983434915542603, "learning_rate": 0.00016521739130434784, "loss": 1.464, "step": 26 }, { "epoch": 0.7448275862068966, "grad_norm": 0.9698247313499451, "learning_rate": 0.00016347826086956523, "loss": 1.4398, "step": 27 }, { "epoch": 0.7724137931034483, "grad_norm": 0.8902468085289001, "learning_rate": 0.00016173913043478262, "loss": 1.3191, "step": 28 }, { "epoch": 0.8, "grad_norm": 0.8863650560379028, "learning_rate": 0.00016, "loss": 1.2048, "step": 29 }, { "epoch": 0.8275862068965517, "grad_norm": 1.0257900953292847, "learning_rate": 0.0001582608695652174, "loss": 1.2085, "step": 30 }, { "epoch": 0.8551724137931035, "grad_norm": 0.9826428294181824, "learning_rate": 0.0001565217391304348, "loss": 1.1733, "step": 31 }, { "epoch": 0.8827586206896552, "grad_norm": 0.9123853445053101, "learning_rate": 0.0001547826086956522, "loss": 1.4226, "step": 32 }, { "epoch": 0.9103448275862069, "grad_norm": 0.8653205633163452, "learning_rate": 0.00015304347826086958, "loss": 1.3998, "step": 33 }, { "epoch": 0.9379310344827586, "grad_norm": 1.2209527492523193, "learning_rate": 0.00015130434782608694, "loss": 1.2225, "step": 34 }, { "epoch": 0.9655172413793104, "grad_norm": 0.977463960647583, "learning_rate": 0.00014956521739130436, "loss": 1.186, "step": 35 }, { "epoch": 0.993103448275862, "grad_norm": 0.8854506611824036, "learning_rate": 0.00014782608695652173, "loss": 0.9478, "step": 36 }, { "epoch": 1.0, "grad_norm": 1.660280704498291, "learning_rate": 0.00014608695652173914, "loss": 0.7476, "step": 37 }, { "epoch": 1.0275862068965518, "grad_norm": 0.9172261953353882, "learning_rate": 0.00014434782608695654, "loss": 0.959, "step": 38 }, { "epoch": 1.0551724137931036, "grad_norm": 0.9950329661369324, "learning_rate": 0.00014260869565217393, "loss": 1.087, "step": 39 }, { "epoch": 1.0827586206896551, "grad_norm": 0.9052255749702454, "learning_rate": 0.00014086956521739132, "loss": 1.0335, "step": 40 }, { "epoch": 1.110344827586207, "grad_norm": 0.8859487771987915, "learning_rate": 0.0001391304347826087, "loss": 1.0489, "step": 41 }, { "epoch": 1.1379310344827587, "grad_norm": 0.9165846705436707, "learning_rate": 0.0001373913043478261, "loss": 1.0135, "step": 42 }, { "epoch": 1.1655172413793102, "grad_norm": 1.2192325592041016, "learning_rate": 0.00013565217391304347, "loss": 1.1084, "step": 43 }, { "epoch": 1.193103448275862, "grad_norm": 1.2101364135742188, "learning_rate": 0.00013391304347826088, "loss": 1.1635, "step": 44 }, { "epoch": 1.2206896551724138, "grad_norm": 1.099292516708374, "learning_rate": 0.00013217391304347825, "loss": 1.1804, "step": 45 }, { "epoch": 1.2482758620689656, "grad_norm": 0.9990763068199158, "learning_rate": 0.00013043478260869567, "loss": 0.8802, "step": 46 }, { "epoch": 1.2758620689655173, "grad_norm": 0.9451124668121338, "learning_rate": 0.00012869565217391303, "loss": 0.9852, "step": 47 }, { "epoch": 1.303448275862069, "grad_norm": 0.96523118019104, "learning_rate": 0.00012695652173913045, "loss": 1.0243, "step": 48 }, { "epoch": 1.3310344827586207, "grad_norm": 1.0256421566009521, "learning_rate": 0.00012521739130434784, "loss": 0.8908, "step": 49 }, { "epoch": 1.3586206896551725, "grad_norm": 1.0647811889648438, "learning_rate": 0.00012347826086956523, "loss": 0.7224, "step": 50 }, { "epoch": 1.386206896551724, "grad_norm": 1.042438268661499, "learning_rate": 0.00012173913043478263, "loss": 0.5707, "step": 51 }, { "epoch": 1.4137931034482758, "grad_norm": 1.0345197916030884, "learning_rate": 0.00012, "loss": 0.7433, "step": 52 }, { "epoch": 1.4413793103448276, "grad_norm": 1.0194092988967896, "learning_rate": 0.00011826086956521741, "loss": 0.8686, "step": 53 }, { "epoch": 1.4689655172413794, "grad_norm": 1.0849523544311523, "learning_rate": 0.00011652173913043479, "loss": 0.9063, "step": 54 }, { "epoch": 1.4965517241379311, "grad_norm": 1.3685775995254517, "learning_rate": 0.00011478260869565218, "loss": 0.8633, "step": 55 }, { "epoch": 1.524137931034483, "grad_norm": 1.2180424928665161, "learning_rate": 0.00011304347826086956, "loss": 0.8131, "step": 56 }, { "epoch": 1.5517241379310345, "grad_norm": 1.027662992477417, "learning_rate": 0.00011130434782608696, "loss": 0.8072, "step": 57 }, { "epoch": 1.5793103448275863, "grad_norm": 1.0541893243789673, "learning_rate": 0.00010956521739130434, "loss": 0.5513, "step": 58 }, { "epoch": 1.6068965517241378, "grad_norm": 0.9840919375419617, "learning_rate": 0.00010782608695652174, "loss": 0.6515, "step": 59 }, { "epoch": 1.6344827586206896, "grad_norm": 1.1880444288253784, "learning_rate": 0.00010608695652173915, "loss": 0.9452, "step": 60 }, { "epoch": 1.6620689655172414, "grad_norm": 1.1577789783477783, "learning_rate": 0.00010434782608695653, "loss": 0.5324, "step": 61 }, { "epoch": 1.6896551724137931, "grad_norm": 1.4066375494003296, "learning_rate": 0.00010260869565217393, "loss": 0.7674, "step": 62 }, { "epoch": 1.717241379310345, "grad_norm": 1.5101147890090942, "learning_rate": 0.00010086956521739131, "loss": 0.7358, "step": 63 }, { "epoch": 1.7448275862068967, "grad_norm": 1.2288732528686523, "learning_rate": 9.91304347826087e-05, "loss": 0.762, "step": 64 }, { "epoch": 1.7724137931034483, "grad_norm": 1.1810815334320068, "learning_rate": 9.739130434782609e-05, "loss": 0.824, "step": 65 }, { "epoch": 1.8, "grad_norm": 1.0823071002960205, "learning_rate": 9.565217391304348e-05, "loss": 0.683, "step": 66 }, { "epoch": 1.8275862068965516, "grad_norm": 1.1553919315338135, "learning_rate": 9.391304347826087e-05, "loss": 0.737, "step": 67 }, { "epoch": 1.8551724137931034, "grad_norm": 1.3099501132965088, "learning_rate": 9.217391304347827e-05, "loss": 0.4993, "step": 68 }, { "epoch": 1.8827586206896552, "grad_norm": 1.3969764709472656, "learning_rate": 9.043478260869566e-05, "loss": 0.6857, "step": 69 }, { "epoch": 1.910344827586207, "grad_norm": 1.2558094263076782, "learning_rate": 8.869565217391305e-05, "loss": 0.5347, "step": 70 }, { "epoch": 1.9379310344827587, "grad_norm": 1.2341969013214111, "learning_rate": 8.695652173913044e-05, "loss": 0.6651, "step": 71 }, { "epoch": 1.9655172413793105, "grad_norm": 1.2917416095733643, "learning_rate": 8.521739130434783e-05, "loss": 0.6177, "step": 72 }, { "epoch": 1.993103448275862, "grad_norm": 1.2867687940597534, "learning_rate": 8.347826086956521e-05, "loss": 0.6304, "step": 73 }, { "epoch": 2.0, "grad_norm": 2.8276941776275635, "learning_rate": 8.173913043478262e-05, "loss": 0.829, "step": 74 }, { "epoch": 2.027586206896552, "grad_norm": 1.1930606365203857, "learning_rate": 8e-05, "loss": 0.3571, "step": 75 }, { "epoch": 2.0551724137931036, "grad_norm": 1.364702820777893, "learning_rate": 7.82608695652174e-05, "loss": 0.4993, "step": 76 }, { "epoch": 2.0827586206896553, "grad_norm": 1.2684059143066406, "learning_rate": 7.652173913043479e-05, "loss": 0.4305, "step": 77 }, { "epoch": 2.110344827586207, "grad_norm": 1.1678532361984253, "learning_rate": 7.478260869565218e-05, "loss": 0.4332, "step": 78 }, { "epoch": 2.1379310344827585, "grad_norm": 1.3142938613891602, "learning_rate": 7.304347826086957e-05, "loss": 0.4618, "step": 79 }, { "epoch": 2.1655172413793102, "grad_norm": 1.359118938446045, "learning_rate": 7.130434782608696e-05, "loss": 0.5795, "step": 80 }, { "epoch": 2.193103448275862, "grad_norm": 1.6325678825378418, "learning_rate": 6.956521739130436e-05, "loss": 0.4021, "step": 81 }, { "epoch": 2.220689655172414, "grad_norm": 1.2178771495819092, "learning_rate": 6.782608695652173e-05, "loss": 0.4262, "step": 82 }, { "epoch": 2.2482758620689656, "grad_norm": 1.0997027158737183, "learning_rate": 6.608695652173912e-05, "loss": 0.3029, "step": 83 }, { "epoch": 2.2758620689655173, "grad_norm": 1.1487294435501099, "learning_rate": 6.434782608695652e-05, "loss": 0.3831, "step": 84 }, { "epoch": 2.303448275862069, "grad_norm": 1.3247836828231812, "learning_rate": 6.260869565217392e-05, "loss": 0.4692, "step": 85 }, { "epoch": 2.3310344827586205, "grad_norm": 1.1617599725723267, "learning_rate": 6.086956521739131e-05, "loss": 0.3617, "step": 86 }, { "epoch": 2.3586206896551722, "grad_norm": 1.2517313957214355, "learning_rate": 5.9130434782608704e-05, "loss": 0.2729, "step": 87 }, { "epoch": 2.386206896551724, "grad_norm": 1.1272892951965332, "learning_rate": 5.739130434782609e-05, "loss": 0.3707, "step": 88 }, { "epoch": 2.413793103448276, "grad_norm": 1.196664571762085, "learning_rate": 5.565217391304348e-05, "loss": 0.4032, "step": 89 }, { "epoch": 2.4413793103448276, "grad_norm": 1.4257408380508423, "learning_rate": 5.391304347826087e-05, "loss": 0.3194, "step": 90 }, { "epoch": 2.4689655172413794, "grad_norm": 1.798063039779663, "learning_rate": 5.217391304347826e-05, "loss": 0.5721, "step": 91 }, { "epoch": 2.496551724137931, "grad_norm": 1.449183702468872, "learning_rate": 5.0434782608695655e-05, "loss": 0.3841, "step": 92 }, { "epoch": 2.524137931034483, "grad_norm": 1.4660217761993408, "learning_rate": 4.8695652173913046e-05, "loss": 0.4134, "step": 93 }, { "epoch": 2.5517241379310347, "grad_norm": 1.3259236812591553, "learning_rate": 4.695652173913044e-05, "loss": 0.5053, "step": 94 }, { "epoch": 2.5793103448275865, "grad_norm": 1.1987637281417847, "learning_rate": 4.521739130434783e-05, "loss": 0.3203, "step": 95 }, { "epoch": 2.606896551724138, "grad_norm": 1.702609896659851, "learning_rate": 4.347826086956522e-05, "loss": 0.5448, "step": 96 }, { "epoch": 2.6344827586206896, "grad_norm": 1.2012200355529785, "learning_rate": 4.1739130434782605e-05, "loss": 0.4839, "step": 97 }, { "epoch": 2.6620689655172414, "grad_norm": 1.1377613544464111, "learning_rate": 4e-05, "loss": 0.3131, "step": 98 }, { "epoch": 2.689655172413793, "grad_norm": 1.377774953842163, "learning_rate": 3.8260869565217395e-05, "loss": 0.3463, "step": 99 }, { "epoch": 2.717241379310345, "grad_norm": 1.1738471984863281, "learning_rate": 3.6521739130434786e-05, "loss": 0.2963, "step": 100 }, { "epoch": 2.7448275862068967, "grad_norm": 1.1475613117218018, "learning_rate": 3.478260869565218e-05, "loss": 0.2953, "step": 101 }, { "epoch": 2.772413793103448, "grad_norm": 1.5838022232055664, "learning_rate": 3.304347826086956e-05, "loss": 0.3852, "step": 102 }, { "epoch": 2.8, "grad_norm": 1.6446831226348877, "learning_rate": 3.130434782608696e-05, "loss": 0.5384, "step": 103 }, { "epoch": 2.8275862068965516, "grad_norm": 1.4402813911437988, "learning_rate": 2.9565217391304352e-05, "loss": 0.3184, "step": 104 }, { "epoch": 2.8551724137931034, "grad_norm": 1.3366456031799316, "learning_rate": 2.782608695652174e-05, "loss": 0.4545, "step": 105 }, { "epoch": 2.882758620689655, "grad_norm": 1.4988086223602295, "learning_rate": 2.608695652173913e-05, "loss": 0.2341, "step": 106 }, { "epoch": 2.910344827586207, "grad_norm": 1.35313880443573, "learning_rate": 2.4347826086956523e-05, "loss": 0.3555, "step": 107 }, { "epoch": 2.9379310344827587, "grad_norm": 1.1439647674560547, "learning_rate": 2.2608695652173914e-05, "loss": 0.299, "step": 108 }, { "epoch": 2.9655172413793105, "grad_norm": 1.2948118448257446, "learning_rate": 2.0869565217391303e-05, "loss": 0.4421, "step": 109 }, { "epoch": 2.9931034482758623, "grad_norm": 1.283116340637207, "learning_rate": 1.9130434782608697e-05, "loss": 0.368, "step": 110 }, { "epoch": 3.0, "grad_norm": 3.516788959503174, "learning_rate": 1.739130434782609e-05, "loss": 0.3088, "step": 111 }, { "epoch": 3.027586206896552, "grad_norm": 1.1082452535629272, "learning_rate": 1.565217391304348e-05, "loss": 0.2828, "step": 112 }, { "epoch": 3.0551724137931036, "grad_norm": 1.109584927558899, "learning_rate": 1.391304347826087e-05, "loss": 0.2485, "step": 113 }, { "epoch": 3.0827586206896553, "grad_norm": 1.0583851337432861, "learning_rate": 1.2173913043478261e-05, "loss": 0.2712, "step": 114 }, { "epoch": 3.110344827586207, "grad_norm": 1.1029939651489258, "learning_rate": 1.0434782608695651e-05, "loss": 0.3537, "step": 115 }, { "epoch": 3.1379310344827585, "grad_norm": 1.0736896991729736, "learning_rate": 8.695652173913044e-06, "loss": 0.2121, "step": 116 }, { "epoch": 3.1655172413793102, "grad_norm": 1.1432276964187622, "learning_rate": 6.956521739130435e-06, "loss": 0.2371, "step": 117 }, { "epoch": 3.193103448275862, "grad_norm": 1.4690179824829102, "learning_rate": 5.217391304347826e-06, "loss": 0.3357, "step": 118 }, { "epoch": 3.220689655172414, "grad_norm": 1.240258812904358, "learning_rate": 3.4782608695652175e-06, "loss": 0.2727, "step": 119 }, { "epoch": 3.2482758620689656, "grad_norm": 1.2821192741394043, "learning_rate": 1.7391304347826088e-06, "loss": 0.3032, "step": 120 } ], "logging_steps": 1, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.068699077260083e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }