{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.71111111111111, "eval_steps": 500, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2962962962962963, "grad_norm": 4.49782157269768, "learning_rate": 9e-06, "loss": 0.8906, "step": 10 }, { "epoch": 0.5925925925925926, "grad_norm": 1.7319757135180889, "learning_rate": 9.71875e-06, "loss": 0.3413, "step": 20 }, { "epoch": 0.8888888888888888, "grad_norm": 1.3535304038871057, "learning_rate": 9.406250000000002e-06, "loss": 0.2847, "step": 30 }, { "epoch": 1.1777777777777778, "grad_norm": 1.1753491938410972, "learning_rate": 9.09375e-06, "loss": 0.2435, "step": 40 }, { "epoch": 1.474074074074074, "grad_norm": 1.3529820931928744, "learning_rate": 8.781250000000002e-06, "loss": 0.2002, "step": 50 }, { "epoch": 1.7703703703703704, "grad_norm": 1.2055625223119095, "learning_rate": 8.468750000000001e-06, "loss": 0.1946, "step": 60 }, { "epoch": 2.0592592592592593, "grad_norm": 0.9694081925518563, "learning_rate": 8.156250000000002e-06, "loss": 0.1785, "step": 70 }, { "epoch": 2.3555555555555556, "grad_norm": 1.2118570235556574, "learning_rate": 7.843750000000001e-06, "loss": 0.1136, "step": 80 }, { "epoch": 2.651851851851852, "grad_norm": 1.138977678286847, "learning_rate": 7.531250000000001e-06, "loss": 0.1115, "step": 90 }, { "epoch": 2.948148148148148, "grad_norm": 1.0036786958910326, "learning_rate": 7.218750000000001e-06, "loss": 0.1126, "step": 100 }, { "epoch": 3.237037037037037, "grad_norm": 0.8367560642733725, "learning_rate": 6.906250000000001e-06, "loss": 0.0728, "step": 110 }, { "epoch": 3.533333333333333, "grad_norm": 1.0509440799673675, "learning_rate": 6.593750000000001e-06, "loss": 0.0617, "step": 120 }, { "epoch": 3.8296296296296295, "grad_norm": 0.8960911427806355, "learning_rate": 6.281250000000001e-06, "loss": 0.0649, "step": 130 }, { "epoch": 4.118518518518519, "grad_norm": 0.7809646327713474, "learning_rate": 5.968750000000001e-06, "loss": 0.0526, "step": 140 }, { "epoch": 4.4148148148148145, "grad_norm": 0.8549249368670622, "learning_rate": 5.656250000000001e-06, "loss": 0.0335, "step": 150 }, { "epoch": 4.711111111111111, "grad_norm": 0.6945788960835679, "learning_rate": 5.343750000000001e-06, "loss": 0.0349, "step": 160 }, { "epoch": 5.0, "grad_norm": 0.7841073342004105, "learning_rate": 5.031250000000001e-06, "loss": 0.0347, "step": 170 }, { "epoch": 5.296296296296296, "grad_norm": 0.6556195586614093, "learning_rate": 4.71875e-06, "loss": 0.0167, "step": 180 }, { "epoch": 5.592592592592593, "grad_norm": 0.6075821111027782, "learning_rate": 4.40625e-06, "loss": 0.0174, "step": 190 }, { "epoch": 5.888888888888889, "grad_norm": 0.6111587561787415, "learning_rate": 4.09375e-06, "loss": 0.0173, "step": 200 }, { "epoch": 6.177777777777778, "grad_norm": 0.40463595396228347, "learning_rate": 3.78125e-06, "loss": 0.0117, "step": 210 }, { "epoch": 6.474074074074074, "grad_norm": 0.39652000374030977, "learning_rate": 3.46875e-06, "loss": 0.0079, "step": 220 }, { "epoch": 6.770370370370371, "grad_norm": 0.4927512843479825, "learning_rate": 3.15625e-06, "loss": 0.0082, "step": 230 }, { "epoch": 7.059259259259259, "grad_norm": 0.36605662616703716, "learning_rate": 2.84375e-06, "loss": 0.0073, "step": 240 }, { "epoch": 7.355555555555555, "grad_norm": 0.2483553830101468, "learning_rate": 2.53125e-06, "loss": 0.0038, "step": 250 }, { "epoch": 7.651851851851852, "grad_norm": 0.28219727023639773, "learning_rate": 2.21875e-06, "loss": 0.0037, "step": 260 }, { "epoch": 7.948148148148148, "grad_norm": 0.4959724939070521, "learning_rate": 1.90625e-06, "loss": 0.0032, "step": 270 }, { "epoch": 8.237037037037037, "grad_norm": 0.29779702369903077, "learning_rate": 1.59375e-06, "loss": 0.0018, "step": 280 }, { "epoch": 8.533333333333333, "grad_norm": 0.20927856889654697, "learning_rate": 1.28125e-06, "loss": 0.0015, "step": 290 }, { "epoch": 8.829629629629629, "grad_norm": 0.09637630938977507, "learning_rate": 9.6875e-07, "loss": 0.0014, "step": 300 }, { "epoch": 9.118518518518519, "grad_norm": 0.060667953625655406, "learning_rate": 6.562500000000001e-07, "loss": 0.0011, "step": 310 }, { "epoch": 9.414814814814815, "grad_norm": 0.061392256534022145, "learning_rate": 3.4375000000000004e-07, "loss": 0.0007, "step": 320 }, { "epoch": 9.71111111111111, "grad_norm": 0.09163877069911756, "learning_rate": 3.1250000000000005e-08, "loss": 0.0008, "step": 330 } ], "logging_steps": 10, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 186149046386688.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }