{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.988610478359909,
  "eval_steps": 500,
  "global_step": 657,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04555808656036447,
      "grad_norm": 16.628382336058554,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 1.5883,
      "step": 10
    },
    {
      "epoch": 0.09111617312072894,
      "grad_norm": 3.221028223725061,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 0.9317,
      "step": 20
    },
    {
      "epoch": 0.1366742596810934,
      "grad_norm": 1.5103642702449496,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.5336,
      "step": 30
    },
    {
      "epoch": 0.18223234624145787,
      "grad_norm": 1.0681798496560502,
      "learning_rate": 6.060606060606061e-06,
      "loss": 0.4417,
      "step": 40
    },
    {
      "epoch": 0.22779043280182232,
      "grad_norm": 0.9679921213671909,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 0.3979,
      "step": 50
    },
    {
      "epoch": 0.2733485193621868,
      "grad_norm": 0.926628742270625,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.3744,
      "step": 60
    },
    {
      "epoch": 0.31890660592255127,
      "grad_norm": 0.9491261260284587,
      "learning_rate": 9.998869765883566e-06,
      "loss": 0.3486,
      "step": 70
    },
    {
      "epoch": 0.36446469248291574,
      "grad_norm": 0.84920311370356,
      "learning_rate": 9.986160499534318e-06,
      "loss": 0.3453,
      "step": 80
    },
    {
      "epoch": 0.41002277904328016,
      "grad_norm": 0.9789183018412179,
      "learning_rate": 9.959365197965824e-06,
      "loss": 0.3572,
      "step": 90
    },
    {
      "epoch": 0.45558086560364464,
      "grad_norm": 0.9102455500916922,
      "learning_rate": 9.918559558613344e-06,
      "loss": 0.3356,
      "step": 100
    },
    {
      "epoch": 0.5011389521640092,
      "grad_norm": 0.9258850899143908,
      "learning_rate": 9.863858858486736e-06,
      "loss": 0.3278,
      "step": 110
    },
    {
      "epoch": 0.5466970387243736,
      "grad_norm": 0.8922762696366542,
      "learning_rate": 9.795417628509857e-06,
      "loss": 0.3268,
      "step": 120
    },
    {
      "epoch": 0.592255125284738,
      "grad_norm": 0.9396732926347776,
      "learning_rate": 9.713429216966624e-06,
      "loss": 0.3108,
      "step": 130
    },
    {
      "epoch": 0.6378132118451025,
      "grad_norm": 0.845601467481683,
      "learning_rate": 9.618125243286989e-06,
      "loss": 0.3164,
      "step": 140
    },
    {
      "epoch": 0.683371298405467,
      "grad_norm": 0.8159514975969553,
      "learning_rate": 9.50977494371594e-06,
      "loss": 0.3127,
      "step": 150
    },
    {
      "epoch": 0.7289293849658315,
      "grad_norm": 0.8778654186332652,
      "learning_rate": 9.388684410713977e-06,
      "loss": 0.3118,
      "step": 160
    },
    {
      "epoch": 0.7744874715261959,
      "grad_norm": 0.8158041504461108,
      "learning_rate": 9.255195728237837e-06,
      "loss": 0.3033,
      "step": 170
    },
    {
      "epoch": 0.8200455580865603,
      "grad_norm": 0.8241477933179293,
      "learning_rate": 9.109686005344258e-06,
      "loss": 0.298,
      "step": 180
    },
    {
      "epoch": 0.8656036446469249,
      "grad_norm": 0.7836442953404367,
      "learning_rate": 8.952566310846931e-06,
      "loss": 0.2967,
      "step": 190
    },
    {
      "epoch": 0.9111617312072893,
      "grad_norm": 0.8589198296868927,
      "learning_rate": 8.784280512036235e-06,
      "loss": 0.2908,
      "step": 200
    },
    {
      "epoch": 0.9567198177676538,
      "grad_norm": 0.8976108880329798,
      "learning_rate": 8.60530402074241e-06,
      "loss": 0.288,
      "step": 210
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.8244261812910136,
      "learning_rate": 8.416142450284565e-06,
      "loss": 0.274,
      "step": 220
    },
    {
      "epoch": 1.0455580865603644,
      "grad_norm": 0.7709445009431932,
      "learning_rate": 8.217330187099689e-06,
      "loss": 0.178,
      "step": 230
    },
    {
      "epoch": 1.0911161731207288,
      "grad_norm": 0.7371669649380324,
      "learning_rate": 8.009428881086836e-06,
      "loss": 0.1706,
      "step": 240
    },
    {
      "epoch": 1.1366742596810935,
      "grad_norm": 0.8325359027964357,
      "learning_rate": 7.793025858931317e-06,
      "loss": 0.1717,
      "step": 250
    },
    {
      "epoch": 1.182232346241458,
      "grad_norm": 0.7810927662905283,
      "learning_rate": 7.568732464891293e-06,
      "loss": 0.1768,
      "step": 260
    },
    {
      "epoch": 1.2277904328018223,
      "grad_norm": 0.7328567227225008,
      "learning_rate": 7.33718233373407e-06,
      "loss": 0.174,
      "step": 270
    },
    {
      "epoch": 1.2733485193621867,
      "grad_norm": 0.829615067889057,
      "learning_rate": 7.099029600701144e-06,
      "loss": 0.1721,
      "step": 280
    },
    {
      "epoch": 1.3189066059225514,
      "grad_norm": 0.7764413190939856,
      "learning_rate": 6.854947053558849e-06,
      "loss": 0.169,
      "step": 290
    },
    {
      "epoch": 1.3644646924829158,
      "grad_norm": 0.7592152186492857,
      "learning_rate": 6.6056242319551315e-06,
      "loss": 0.1667,
      "step": 300
    },
    {
      "epoch": 1.4100227790432802,
      "grad_norm": 0.7583979219386041,
      "learning_rate": 6.3517654794518156e-06,
      "loss": 0.1644,
      "step": 310
    },
    {
      "epoch": 1.4555808656036446,
      "grad_norm": 0.7329938588396943,
      "learning_rate": 6.094087953735423e-06,
      "loss": 0.168,
      "step": 320
    },
    {
      "epoch": 1.501138952164009,
      "grad_norm": 0.8196364714199265,
      "learning_rate": 5.8333196006277536e-06,
      "loss": 0.1617,
      "step": 330
    },
    {
      "epoch": 1.5466970387243735,
      "grad_norm": 0.7565763296431312,
      "learning_rate": 5.570197097619688e-06,
      "loss": 0.1611,
      "step": 340
    },
    {
      "epoch": 1.592255125284738,
      "grad_norm": 0.8868392512031471,
      "learning_rate": 5.305463772737812e-06,
      "loss": 0.1609,
      "step": 350
    },
    {
      "epoch": 1.6378132118451025,
      "grad_norm": 0.8812463705246544,
      "learning_rate": 5.039867504623084e-06,
      "loss": 0.1677,
      "step": 360
    },
    {
      "epoch": 1.683371298405467,
      "grad_norm": 0.7633777742059934,
      "learning_rate": 4.774158609753908e-06,
      "loss": 0.1512,
      "step": 370
    },
    {
      "epoch": 1.7289293849658316,
      "grad_norm": 0.7820693250794272,
      "learning_rate": 4.5090877227822424e-06,
      "loss": 0.1572,
      "step": 380
    },
    {
      "epoch": 1.774487471526196,
      "grad_norm": 0.7688524374447184,
      "learning_rate": 4.245403675970877e-06,
      "loss": 0.1617,
      "step": 390
    },
    {
      "epoch": 1.8200455580865604,
      "grad_norm": 0.7254423021253176,
      "learning_rate": 3.9838513837224814e-06,
      "loss": 0.1519,
      "step": 400
    },
    {
      "epoch": 1.8656036446469249,
      "grad_norm": 0.8076268890579671,
      "learning_rate": 3.7251697381767373e-06,
      "loss": 0.154,
      "step": 410
    },
    {
      "epoch": 1.9111617312072893,
      "grad_norm": 0.7615743261772087,
      "learning_rate": 3.4700895218205026e-06,
      "loss": 0.1468,
      "step": 420
    },
    {
      "epoch": 1.9567198177676537,
      "grad_norm": 0.7511849885826963,
      "learning_rate": 3.2193313430079737e-06,
      "loss": 0.147,
      "step": 430
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7584213495047252,
      "learning_rate": 2.9736036002230332e-06,
      "loss": 0.1429,
      "step": 440
    },
    {
      "epoch": 2.0455580865603644,
      "grad_norm": 0.6815225387343684,
      "learning_rate": 2.7336004808348094e-06,
      "loss": 0.0751,
      "step": 450
    },
    {
      "epoch": 2.091116173120729,
      "grad_norm": 0.6174750652979677,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.0711,
      "step": 460
    },
    {
      "epoch": 2.1366742596810933,
      "grad_norm": 0.6236626143150734,
      "learning_rate": 2.273462085252146e-06,
      "loss": 0.0722,
      "step": 470
    },
    {
      "epoch": 2.1822323462414577,
      "grad_norm": 0.5971939289865311,
      "learning_rate": 2.0546267121888863e-06,
      "loss": 0.0693,
      "step": 480
    },
    {
      "epoch": 2.2277904328018225,
      "grad_norm": 0.631697145877715,
      "learning_rate": 1.8441120965239912e-06,
      "loss": 0.0713,
      "step": 490
    },
    {
      "epoch": 2.273348519362187,
      "grad_norm": 0.6817300872785331,
      "learning_rate": 1.642512947611622e-06,
      "loss": 0.0729,
      "step": 500
    },
    {
      "epoch": 2.3189066059225514,
      "grad_norm": 0.5682490177071967,
      "learning_rate": 1.4503987883766857e-06,
      "loss": 0.0656,
      "step": 510
    },
    {
      "epoch": 2.364464692482916,
      "grad_norm": 0.5912819077013511,
      "learning_rate": 1.2683123463975144e-06,
      "loss": 0.0658,
      "step": 520
    },
    {
      "epoch": 2.41002277904328,
      "grad_norm": 0.606352905873965,
      "learning_rate": 1.0967680206861198e-06,
      "loss": 0.0694,
      "step": 530
    },
    {
      "epoch": 2.4555808656036446,
      "grad_norm": 0.6353412192229968,
      "learning_rate": 9.362504284973683e-07,
      "loss": 0.0701,
      "step": 540
    },
    {
      "epoch": 2.501138952164009,
      "grad_norm": 0.6607014617898831,
      "learning_rate": 7.872130362724422e-07,
      "loss": 0.0691,
      "step": 550
    },
    {
      "epoch": 2.5466970387243735,
      "grad_norm": 0.6527059303311876,
      "learning_rate": 6.500768785841482e-07,
      "loss": 0.0647,
      "step": 560
    },
    {
      "epoch": 2.592255125284738,
      "grad_norm": 0.6203144834959747,
      "learning_rate": 5.252293687031196e-07,
      "loss": 0.0669,
      "step": 570
    },
    {
      "epoch": 2.6378132118451028,
      "grad_norm": 0.6069382959627451,
      "learning_rate": 4.130232041450866e-07,
      "loss": 0.0624,
      "step": 580
    },
    {
      "epoch": 2.6833712984054667,
      "grad_norm": 0.6304879856190249,
      "learning_rate": 3.1377537029107174e-07,
      "loss": 0.0655,
      "step": 590
    },
    {
      "epoch": 2.7289293849658316,
      "grad_norm": 0.6325528536254185,
      "learning_rate": 2.2776624489530664e-07,
      "loss": 0.0667,
      "step": 600
    },
    {
      "epoch": 2.774487471526196,
      "grad_norm": 0.6179538898003908,
      "learning_rate": 1.55238806010668e-07,
      "loss": 0.0644,
      "step": 610
    },
    {
      "epoch": 2.8200455580865604,
      "grad_norm": 0.5965673740089573,
      "learning_rate": 9.639794556925041e-08,
      "loss": 0.0659,
      "step": 620
    },
    {
      "epoch": 2.865603644646925,
      "grad_norm": 0.6382075048871372,
      "learning_rate": 5.1409890557246876e-08,
      "loss": 0.0645,
      "step": 630
    },
    {
      "epoch": 2.9111617312072893,
      "grad_norm": 0.612194769673645,
      "learning_rate": 2.0401733419315727e-08,
      "loss": 0.063,
      "step": 640
    },
    {
      "epoch": 2.9567198177676537,
      "grad_norm": 0.6571824905728761,
      "learning_rate": 3.4610730190648423e-09,
      "loss": 0.0647,
      "step": 650
    }
  ],
  "logging_steps": 10,
  "max_steps": 657,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 418097876697088.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}