diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 220, + "global_step": 4390, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002280501710376283, + "grad_norm": 44.5, + "learning_rate": 0.0, + "loss": 2.4621, + "mean_token_accuracy": 0.659612387418747, + "num_tokens": 107751.0, + "step": 1 + }, + { + "epoch": 0.004561003420752566, + "grad_norm": 42.25, + "learning_rate": 2.272727272727273e-08, + "loss": 2.5474, + "mean_token_accuracy": 0.6517423838376999, + "num_tokens": 214469.0, + "step": 2 + }, + { + "epoch": 0.0068415051311288486, + "grad_norm": 48.75, + "learning_rate": 4.545454545454546e-08, + "loss": 2.5963, + "mean_token_accuracy": 0.6451424211263657, + "num_tokens": 321493.0, + "step": 3 + }, + { + "epoch": 0.009122006841505131, + "grad_norm": 52.75, + "learning_rate": 6.818181818181819e-08, + "loss": 2.5346, + "mean_token_accuracy": 0.656927764415741, + "num_tokens": 428374.0, + "step": 4 + }, + { + "epoch": 0.011402508551881414, + "grad_norm": 33.75, + "learning_rate": 9.090909090909091e-08, + "loss": 2.5273, + "mean_token_accuracy": 0.6588353216648102, + "num_tokens": 535756.0, + "step": 5 + }, + { + "epoch": 0.013683010262257697, + "grad_norm": 32.0, + "learning_rate": 1.1363636363636364e-07, + "loss": 2.4468, + "mean_token_accuracy": 0.6671261787414551, + "num_tokens": 642560.0, + "step": 6 + }, + { + "epoch": 0.01596351197263398, + "grad_norm": 33.5, + "learning_rate": 1.3636363636363637e-07, + "loss": 2.5426, + "mean_token_accuracy": 0.6598037928342819, + "num_tokens": 749804.0, + "step": 7 + }, + { + "epoch": 0.018244013683010263, + "grad_norm": 33.25, + "learning_rate": 1.590909090909091e-07, + "loss": 2.5504, + "mean_token_accuracy": 0.6524623930454254, + "num_tokens": 857462.0, + "step": 8 + }, + { + "epoch": 0.020524515393386546, + "grad_norm": 39.25, + "learning_rate": 1.8181818181818183e-07, + "loss": 2.4795, + "mean_token_accuracy": 0.6614948958158493, + "num_tokens": 964652.0, + "step": 9 + }, + { + "epoch": 0.02280501710376283, + "grad_norm": 32.25, + "learning_rate": 2.0454545454545456e-07, + "loss": 2.4681, + "mean_token_accuracy": 0.6571934372186661, + "num_tokens": 1072114.0, + "step": 10 + }, + { + "epoch": 0.02508551881413911, + "grad_norm": 56.75, + "learning_rate": 2.2727272727272729e-07, + "loss": 2.5162, + "mean_token_accuracy": 0.6555981487035751, + "num_tokens": 1179487.0, + "step": 11 + }, + { + "epoch": 0.027366020524515394, + "grad_norm": 50.0, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.5639, + "mean_token_accuracy": 0.6516416519880295, + "num_tokens": 1286278.0, + "step": 12 + }, + { + "epoch": 0.029646522234891677, + "grad_norm": 36.5, + "learning_rate": 2.7272727272727274e-07, + "loss": 2.4399, + "mean_token_accuracy": 0.669710099697113, + "num_tokens": 1393298.0, + "step": 13 + }, + { + "epoch": 0.03192702394526796, + "grad_norm": 39.0, + "learning_rate": 2.954545454545455e-07, + "loss": 2.4725, + "mean_token_accuracy": 0.6617565155029297, + "num_tokens": 1500702.0, + "step": 14 + }, + { + "epoch": 0.03420752565564424, + "grad_norm": 35.5, + "learning_rate": 3.181818181818182e-07, + "loss": 2.4827, + "mean_token_accuracy": 0.656602531671524, + "num_tokens": 1607486.0, + "step": 15 + }, + { + "epoch": 0.036488027366020526, + "grad_norm": 47.0, + "learning_rate": 3.409090909090909e-07, + "loss": 2.5839, + "mean_token_accuracy": 0.6526448428630829, + "num_tokens": 1714872.0, + "step": 16 + }, + { + "epoch": 0.03876852907639681, + "grad_norm": 35.25, + "learning_rate": 3.6363636363636366e-07, + "loss": 2.467, + "mean_token_accuracy": 0.6649379134178162, + "num_tokens": 1822351.0, + "step": 17 + }, + { + "epoch": 0.04104903078677309, + "grad_norm": 40.75, + "learning_rate": 3.8636363636363636e-07, + "loss": 2.5003, + "mean_token_accuracy": 0.6564287096261978, + "num_tokens": 1930114.0, + "step": 18 + }, + { + "epoch": 0.043329532497149374, + "grad_norm": 33.5, + "learning_rate": 4.090909090909091e-07, + "loss": 2.5395, + "mean_token_accuracy": 0.6506040990352631, + "num_tokens": 2038041.0, + "step": 19 + }, + { + "epoch": 0.04561003420752566, + "grad_norm": 32.75, + "learning_rate": 4.3181818181818187e-07, + "loss": 2.5359, + "mean_token_accuracy": 0.6554941684007645, + "num_tokens": 2144846.0, + "step": 20 + }, + { + "epoch": 0.04789053591790194, + "grad_norm": 31.125, + "learning_rate": 4.5454545454545457e-07, + "loss": 2.422, + "mean_token_accuracy": 0.6640460044145584, + "num_tokens": 2252295.0, + "step": 21 + }, + { + "epoch": 0.05017103762827822, + "grad_norm": 37.5, + "learning_rate": 4.772727272727274e-07, + "loss": 2.5004, + "mean_token_accuracy": 0.659651443362236, + "num_tokens": 2359439.0, + "step": 22 + }, + { + "epoch": 0.052451539338654506, + "grad_norm": 33.5, + "learning_rate": 5.000000000000001e-07, + "loss": 2.5621, + "mean_token_accuracy": 0.6528295874595642, + "num_tokens": 2466337.0, + "step": 23 + }, + { + "epoch": 0.05473204104903079, + "grad_norm": 33.0, + "learning_rate": 5.227272727272728e-07, + "loss": 2.4917, + "mean_token_accuracy": 0.661816731095314, + "num_tokens": 2572770.0, + "step": 24 + }, + { + "epoch": 0.05701254275940707, + "grad_norm": 45.0, + "learning_rate": 5.454545454545455e-07, + "loss": 2.402, + "mean_token_accuracy": 0.663006380200386, + "num_tokens": 2680167.0, + "step": 25 + }, + { + "epoch": 0.059293044469783354, + "grad_norm": 44.5, + "learning_rate": 5.681818181818182e-07, + "loss": 2.4679, + "mean_token_accuracy": 0.6532121747732162, + "num_tokens": 2787861.0, + "step": 26 + }, + { + "epoch": 0.06157354618015964, + "grad_norm": 34.0, + "learning_rate": 5.90909090909091e-07, + "loss": 2.5954, + "mean_token_accuracy": 0.64468814432621, + "num_tokens": 2894486.0, + "step": 27 + }, + { + "epoch": 0.06385404789053592, + "grad_norm": 56.5, + "learning_rate": 6.136363636363637e-07, + "loss": 2.5431, + "mean_token_accuracy": 0.6518365442752838, + "num_tokens": 3001693.0, + "step": 28 + }, + { + "epoch": 0.0661345496009122, + "grad_norm": 37.75, + "learning_rate": 6.363636363636364e-07, + "loss": 2.5421, + "mean_token_accuracy": 0.6533436477184296, + "num_tokens": 3108771.0, + "step": 29 + }, + { + "epoch": 0.06841505131128849, + "grad_norm": 31.5, + "learning_rate": 6.590909090909091e-07, + "loss": 2.4357, + "mean_token_accuracy": 0.6666090935468674, + "num_tokens": 3215312.0, + "step": 30 + }, + { + "epoch": 0.07069555302166476, + "grad_norm": 31.5, + "learning_rate": 6.818181818181818e-07, + "loss": 2.4502, + "mean_token_accuracy": 0.6624381989240646, + "num_tokens": 3322496.0, + "step": 31 + }, + { + "epoch": 0.07297605473204105, + "grad_norm": 32.25, + "learning_rate": 7.045454545454545e-07, + "loss": 2.5082, + "mean_token_accuracy": 0.6581026613712311, + "num_tokens": 3429869.0, + "step": 32 + }, + { + "epoch": 0.07525655644241733, + "grad_norm": 32.5, + "learning_rate": 7.272727272727273e-07, + "loss": 2.4792, + "mean_token_accuracy": 0.6572518944740295, + "num_tokens": 3536755.0, + "step": 33 + }, + { + "epoch": 0.07753705815279362, + "grad_norm": 31.75, + "learning_rate": 7.5e-07, + "loss": 2.4916, + "mean_token_accuracy": 0.6520580649375916, + "num_tokens": 3644347.0, + "step": 34 + }, + { + "epoch": 0.07981755986316989, + "grad_norm": 36.0, + "learning_rate": 7.727272727272727e-07, + "loss": 2.4185, + "mean_token_accuracy": 0.6675815433263779, + "num_tokens": 3751037.0, + "step": 35 + }, + { + "epoch": 0.08209806157354618, + "grad_norm": 32.75, + "learning_rate": 7.954545454545455e-07, + "loss": 2.4929, + "mean_token_accuracy": 0.6536126732826233, + "num_tokens": 3857928.0, + "step": 36 + }, + { + "epoch": 0.08437856328392246, + "grad_norm": 31.125, + "learning_rate": 8.181818181818182e-07, + "loss": 2.401, + "mean_token_accuracy": 0.6677189022302628, + "num_tokens": 3965628.0, + "step": 37 + }, + { + "epoch": 0.08665906499429875, + "grad_norm": 31.875, + "learning_rate": 8.409090909090909e-07, + "loss": 2.447, + "mean_token_accuracy": 0.6582602560520172, + "num_tokens": 4072519.0, + "step": 38 + }, + { + "epoch": 0.08893956670467502, + "grad_norm": 31.375, + "learning_rate": 8.636363636363637e-07, + "loss": 2.4231, + "mean_token_accuracy": 0.6686751246452332, + "num_tokens": 4180022.0, + "step": 39 + }, + { + "epoch": 0.09122006841505131, + "grad_norm": 47.25, + "learning_rate": 8.863636363636364e-07, + "loss": 2.4306, + "mean_token_accuracy": 0.6607790142297745, + "num_tokens": 4287045.0, + "step": 40 + }, + { + "epoch": 0.09350057012542759, + "grad_norm": 33.0, + "learning_rate": 9.090909090909091e-07, + "loss": 2.326, + "mean_token_accuracy": 0.6676892191171646, + "num_tokens": 4394995.0, + "step": 41 + }, + { + "epoch": 0.09578107183580388, + "grad_norm": 32.75, + "learning_rate": 9.31818181818182e-07, + "loss": 2.4869, + "mean_token_accuracy": 0.6459519267082214, + "num_tokens": 4501329.0, + "step": 42 + }, + { + "epoch": 0.09806157354618016, + "grad_norm": 31.875, + "learning_rate": 9.545454545454548e-07, + "loss": 2.5084, + "mean_token_accuracy": 0.6592301279306412, + "num_tokens": 4608718.0, + "step": 43 + }, + { + "epoch": 0.10034207525655645, + "grad_norm": 31.375, + "learning_rate": 9.772727272727275e-07, + "loss": 2.4805, + "mean_token_accuracy": 0.6500378400087357, + "num_tokens": 4715658.0, + "step": 44 + }, + { + "epoch": 0.10262257696693272, + "grad_norm": 30.75, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4318, + "mean_token_accuracy": 0.6553644686937332, + "num_tokens": 4822368.0, + "step": 45 + }, + { + "epoch": 0.10490307867730901, + "grad_norm": 31.625, + "learning_rate": 1.0227272727272729e-06, + "loss": 2.4468, + "mean_token_accuracy": 0.6600785255432129, + "num_tokens": 4929696.0, + "step": 46 + }, + { + "epoch": 0.10718358038768529, + "grad_norm": 30.25, + "learning_rate": 1.0454545454545456e-06, + "loss": 2.3474, + "mean_token_accuracy": 0.6583467125892639, + "num_tokens": 5037621.0, + "step": 47 + }, + { + "epoch": 0.10946408209806158, + "grad_norm": 30.375, + "learning_rate": 1.0681818181818183e-06, + "loss": 2.3681, + "mean_token_accuracy": 0.6633005887269974, + "num_tokens": 5144969.0, + "step": 48 + }, + { + "epoch": 0.11174458380843785, + "grad_norm": 30.25, + "learning_rate": 1.090909090909091e-06, + "loss": 2.4083, + "mean_token_accuracy": 0.6564080715179443, + "num_tokens": 5252296.0, + "step": 49 + }, + { + "epoch": 0.11402508551881414, + "grad_norm": 31.125, + "learning_rate": 1.1136363636363637e-06, + "loss": 2.3943, + "mean_token_accuracy": 0.6526119261980057, + "num_tokens": 5358939.0, + "step": 50 + }, + { + "epoch": 0.11630558722919042, + "grad_norm": 31.125, + "learning_rate": 1.1363636363636364e-06, + "loss": 2.4018, + "mean_token_accuracy": 0.6534301191568375, + "num_tokens": 5465936.0, + "step": 51 + }, + { + "epoch": 0.11858608893956671, + "grad_norm": 30.125, + "learning_rate": 1.159090909090909e-06, + "loss": 2.3053, + "mean_token_accuracy": 0.6658475399017334, + "num_tokens": 5572802.0, + "step": 52 + }, + { + "epoch": 0.12086659064994298, + "grad_norm": 28.625, + "learning_rate": 1.181818181818182e-06, + "loss": 2.2423, + "mean_token_accuracy": 0.6766009479761124, + "num_tokens": 5680067.0, + "step": 53 + }, + { + "epoch": 0.12314709236031927, + "grad_norm": 29.125, + "learning_rate": 1.2045454545454547e-06, + "loss": 2.3047, + "mean_token_accuracy": 0.6613978892564774, + "num_tokens": 5787260.0, + "step": 54 + }, + { + "epoch": 0.12542759407069556, + "grad_norm": 28.875, + "learning_rate": 1.2272727272727274e-06, + "loss": 2.2284, + "mean_token_accuracy": 0.6748173385858536, + "num_tokens": 5894097.0, + "step": 55 + }, + { + "epoch": 0.12770809578107184, + "grad_norm": 28.5, + "learning_rate": 1.25e-06, + "loss": 2.3062, + "mean_token_accuracy": 0.665241539478302, + "num_tokens": 6001210.0, + "step": 56 + }, + { + "epoch": 0.12998859749144812, + "grad_norm": 26.75, + "learning_rate": 1.2727272727272728e-06, + "loss": 2.1486, + "mean_token_accuracy": 0.6805437654256821, + "num_tokens": 6109107.0, + "step": 57 + }, + { + "epoch": 0.1322690992018244, + "grad_norm": 28.25, + "learning_rate": 1.2954545454545455e-06, + "loss": 2.2641, + "mean_token_accuracy": 0.6705231070518494, + "num_tokens": 6216222.0, + "step": 58 + }, + { + "epoch": 0.1345496009122007, + "grad_norm": 28.75, + "learning_rate": 1.3181818181818182e-06, + "loss": 2.2772, + "mean_token_accuracy": 0.6666675806045532, + "num_tokens": 6323452.0, + "step": 59 + }, + { + "epoch": 0.13683010262257697, + "grad_norm": 29.625, + "learning_rate": 1.3409090909090911e-06, + "loss": 2.231, + "mean_token_accuracy": 0.6695638597011566, + "num_tokens": 6430912.0, + "step": 60 + }, + { + "epoch": 0.13911060433295325, + "grad_norm": 27.375, + "learning_rate": 1.3636363636363636e-06, + "loss": 2.2394, + "mean_token_accuracy": 0.6609823554754257, + "num_tokens": 6538019.0, + "step": 61 + }, + { + "epoch": 0.14139110604332952, + "grad_norm": 27.75, + "learning_rate": 1.3863636363636365e-06, + "loss": 2.2672, + "mean_token_accuracy": 0.6677799224853516, + "num_tokens": 6644863.0, + "step": 62 + }, + { + "epoch": 0.14367160775370583, + "grad_norm": 27.5, + "learning_rate": 1.409090909090909e-06, + "loss": 2.2019, + "mean_token_accuracy": 0.66669762134552, + "num_tokens": 6752333.0, + "step": 63 + }, + { + "epoch": 0.1459521094640821, + "grad_norm": 27.0, + "learning_rate": 1.431818181818182e-06, + "loss": 2.2077, + "mean_token_accuracy": 0.6657529324293137, + "num_tokens": 6859405.0, + "step": 64 + }, + { + "epoch": 0.14823261117445838, + "grad_norm": 37.75, + "learning_rate": 1.4545454545454546e-06, + "loss": 2.1799, + "mean_token_accuracy": 0.6656498908996582, + "num_tokens": 6966421.0, + "step": 65 + }, + { + "epoch": 0.15051311288483465, + "grad_norm": 27.25, + "learning_rate": 1.4772727272727275e-06, + "loss": 2.2099, + "mean_token_accuracy": 0.6709634065628052, + "num_tokens": 7073157.0, + "step": 66 + }, + { + "epoch": 0.15279361459521096, + "grad_norm": 29.5, + "learning_rate": 1.5e-06, + "loss": 2.1091, + "mean_token_accuracy": 0.6776281297206879, + "num_tokens": 7180043.0, + "step": 67 + }, + { + "epoch": 0.15507411630558723, + "grad_norm": 27.25, + "learning_rate": 1.522727272727273e-06, + "loss": 2.1265, + "mean_token_accuracy": 0.6751858294010162, + "num_tokens": 7287133.0, + "step": 68 + }, + { + "epoch": 0.1573546180159635, + "grad_norm": 25.875, + "learning_rate": 1.5454545454545454e-06, + "loss": 2.1089, + "mean_token_accuracy": 0.6743377894163132, + "num_tokens": 7394475.0, + "step": 69 + }, + { + "epoch": 0.15963511972633979, + "grad_norm": 29.375, + "learning_rate": 1.5681818181818184e-06, + "loss": 2.0552, + "mean_token_accuracy": 0.6845020353794098, + "num_tokens": 7502908.0, + "step": 70 + }, + { + "epoch": 0.1619156214367161, + "grad_norm": 26.25, + "learning_rate": 1.590909090909091e-06, + "loss": 2.1523, + "mean_token_accuracy": 0.6694881021976471, + "num_tokens": 7609855.0, + "step": 71 + }, + { + "epoch": 0.16419612314709237, + "grad_norm": 26.125, + "learning_rate": 1.613636363636364e-06, + "loss": 2.168, + "mean_token_accuracy": 0.6710262894630432, + "num_tokens": 7716660.0, + "step": 72 + }, + { + "epoch": 0.16647662485746864, + "grad_norm": 25.875, + "learning_rate": 1.6363636363636365e-06, + "loss": 2.0526, + "mean_token_accuracy": 0.6768545061349869, + "num_tokens": 7823567.0, + "step": 73 + }, + { + "epoch": 0.16875712656784492, + "grad_norm": 25.375, + "learning_rate": 1.6590909090909094e-06, + "loss": 2.0024, + "mean_token_accuracy": 0.6849700808525085, + "num_tokens": 7930741.0, + "step": 74 + }, + { + "epoch": 0.17103762827822122, + "grad_norm": 24.375, + "learning_rate": 1.6818181818181819e-06, + "loss": 2.0285, + "mean_token_accuracy": 0.6854856461286545, + "num_tokens": 8037391.0, + "step": 75 + }, + { + "epoch": 0.1733181299885975, + "grad_norm": 24.125, + "learning_rate": 1.7045454545454546e-06, + "loss": 1.9599, + "mean_token_accuracy": 0.6963415741920471, + "num_tokens": 8144926.0, + "step": 76 + }, + { + "epoch": 0.17559863169897377, + "grad_norm": 25.375, + "learning_rate": 1.7272727272727275e-06, + "loss": 1.9961, + "mean_token_accuracy": 0.6796167641878128, + "num_tokens": 8251774.0, + "step": 77 + }, + { + "epoch": 0.17787913340935005, + "grad_norm": 23.25, + "learning_rate": 1.75e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.7025072127580643, + "num_tokens": 8358808.0, + "step": 78 + }, + { + "epoch": 0.18015963511972635, + "grad_norm": 23.625, + "learning_rate": 1.7727272727272729e-06, + "loss": 1.8932, + "mean_token_accuracy": 0.6979697048664093, + "num_tokens": 8466050.0, + "step": 79 + }, + { + "epoch": 0.18244013683010263, + "grad_norm": 24.25, + "learning_rate": 1.7954545454545456e-06, + "loss": 1.9363, + "mean_token_accuracy": 0.6897751837968826, + "num_tokens": 8572892.0, + "step": 80 + }, + { + "epoch": 0.1847206385404789, + "grad_norm": 23.375, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.9175, + "mean_token_accuracy": 0.6948112696409225, + "num_tokens": 8679995.0, + "step": 81 + }, + { + "epoch": 0.18700114025085518, + "grad_norm": 22.625, + "learning_rate": 1.840909090909091e-06, + "loss": 1.825, + "mean_token_accuracy": 0.7063078880310059, + "num_tokens": 8787388.0, + "step": 82 + }, + { + "epoch": 0.18928164196123148, + "grad_norm": 21.875, + "learning_rate": 1.863636363636364e-06, + "loss": 1.8128, + "mean_token_accuracy": 0.7035099864006042, + "num_tokens": 8894141.0, + "step": 83 + }, + { + "epoch": 0.19156214367160776, + "grad_norm": 22.375, + "learning_rate": 1.8863636363636364e-06, + "loss": 1.8433, + "mean_token_accuracy": 0.6927594691514969, + "num_tokens": 9000732.0, + "step": 84 + }, + { + "epoch": 0.19384264538198404, + "grad_norm": 21.5, + "learning_rate": 1.9090909090909095e-06, + "loss": 1.7329, + "mean_token_accuracy": 0.7171757072210312, + "num_tokens": 9107855.0, + "step": 85 + }, + { + "epoch": 0.1961231470923603, + "grad_norm": 21.5, + "learning_rate": 1.931818181818182e-06, + "loss": 1.7188, + "mean_token_accuracy": 0.7101240158081055, + "num_tokens": 9215544.0, + "step": 86 + }, + { + "epoch": 0.19840364880273662, + "grad_norm": 21.625, + "learning_rate": 1.954545454545455e-06, + "loss": 1.7445, + "mean_token_accuracy": 0.7046354413032532, + "num_tokens": 9322163.0, + "step": 87 + }, + { + "epoch": 0.2006841505131129, + "grad_norm": 31.75, + "learning_rate": 1.977272727272727e-06, + "loss": 1.6605, + "mean_token_accuracy": 0.7182554006576538, + "num_tokens": 9428959.0, + "step": 88 + }, + { + "epoch": 0.20296465222348917, + "grad_norm": 20.625, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6955, + "mean_token_accuracy": 0.7105437815189362, + "num_tokens": 9535594.0, + "step": 89 + }, + { + "epoch": 0.20524515393386544, + "grad_norm": 20.5, + "learning_rate": 2.022727272727273e-06, + "loss": 1.6534, + "mean_token_accuracy": 0.7167372107505798, + "num_tokens": 9642716.0, + "step": 90 + }, + { + "epoch": 0.20752565564424175, + "grad_norm": 20.125, + "learning_rate": 2.0454545454545457e-06, + "loss": 1.6508, + "mean_token_accuracy": 0.7186834812164307, + "num_tokens": 9749032.0, + "step": 91 + }, + { + "epoch": 0.20980615735461802, + "grad_norm": 19.625, + "learning_rate": 2.0681818181818184e-06, + "loss": 1.6197, + "mean_token_accuracy": 0.7244506776332855, + "num_tokens": 9855340.0, + "step": 92 + }, + { + "epoch": 0.2120866590649943, + "grad_norm": 19.5, + "learning_rate": 2.090909090909091e-06, + "loss": 1.5969, + "mean_token_accuracy": 0.7231418937444687, + "num_tokens": 9962265.0, + "step": 93 + }, + { + "epoch": 0.21436716077537057, + "grad_norm": 18.5, + "learning_rate": 2.113636363636364e-06, + "loss": 1.5364, + "mean_token_accuracy": 0.7311695367097855, + "num_tokens": 10069156.0, + "step": 94 + }, + { + "epoch": 0.21664766248574688, + "grad_norm": 17.625, + "learning_rate": 2.1363636363636365e-06, + "loss": 1.4815, + "mean_token_accuracy": 0.7454914003610611, + "num_tokens": 10176688.0, + "step": 95 + }, + { + "epoch": 0.21892816419612315, + "grad_norm": 17.625, + "learning_rate": 2.1590909090909092e-06, + "loss": 1.4978, + "mean_token_accuracy": 0.7258649468421936, + "num_tokens": 10283306.0, + "step": 96 + }, + { + "epoch": 0.22120866590649943, + "grad_norm": 17.5, + "learning_rate": 2.181818181818182e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.7376868277788162, + "num_tokens": 10390084.0, + "step": 97 + }, + { + "epoch": 0.2234891676168757, + "grad_norm": 17.125, + "learning_rate": 2.2045454545454547e-06, + "loss": 1.4221, + "mean_token_accuracy": 0.7373760044574738, + "num_tokens": 10497627.0, + "step": 98 + }, + { + "epoch": 0.22576966932725198, + "grad_norm": 23.875, + "learning_rate": 2.2272727272727274e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.7427248060703278, + "num_tokens": 10604483.0, + "step": 99 + }, + { + "epoch": 0.22805017103762829, + "grad_norm": 18.0, + "learning_rate": 2.25e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.7454716116189957, + "num_tokens": 10711923.0, + "step": 100 + }, + { + "epoch": 0.23033067274800456, + "grad_norm": 19.375, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.3648, + "mean_token_accuracy": 0.749587893486023, + "num_tokens": 10818737.0, + "step": 101 + }, + { + "epoch": 0.23261117445838084, + "grad_norm": 21.5, + "learning_rate": 2.295454545454546e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.7486944496631622, + "num_tokens": 10925328.0, + "step": 102 + }, + { + "epoch": 0.2348916761687571, + "grad_norm": 14.9375, + "learning_rate": 2.318181818181818e-06, + "loss": 1.2484, + "mean_token_accuracy": 0.7670646905899048, + "num_tokens": 11032717.0, + "step": 103 + }, + { + "epoch": 0.23717217787913342, + "grad_norm": 14.9375, + "learning_rate": 2.3409090909090913e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.7583093047142029, + "num_tokens": 11139345.0, + "step": 104 + }, + { + "epoch": 0.2394526795895097, + "grad_norm": 14.375, + "learning_rate": 2.363636363636364e-06, + "loss": 1.2064, + "mean_token_accuracy": 0.7690076977014542, + "num_tokens": 11245962.0, + "step": 105 + }, + { + "epoch": 0.24173318129988597, + "grad_norm": 13.9375, + "learning_rate": 2.3863636363636367e-06, + "loss": 1.2029, + "mean_token_accuracy": 0.7617898732423782, + "num_tokens": 11352972.0, + "step": 106 + }, + { + "epoch": 0.24401368301026224, + "grad_norm": 13.75, + "learning_rate": 2.4090909090909094e-06, + "loss": 1.153, + "mean_token_accuracy": 0.7770701050758362, + "num_tokens": 11460132.0, + "step": 107 + }, + { + "epoch": 0.24629418472063855, + "grad_norm": 13.8125, + "learning_rate": 2.431818181818182e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.7785277962684631, + "num_tokens": 11567105.0, + "step": 108 + }, + { + "epoch": 0.24857468643101482, + "grad_norm": 12.5625, + "learning_rate": 2.454545454545455e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.7852117866277695, + "num_tokens": 11674498.0, + "step": 109 + }, + { + "epoch": 0.2508551881413911, + "grad_norm": 12.625, + "learning_rate": 2.4772727272727275e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.7985803633928299, + "num_tokens": 11780696.0, + "step": 110 + }, + { + "epoch": 0.2531356898517674, + "grad_norm": 12.0, + "learning_rate": 2.5e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.7939664274454117, + "num_tokens": 11887719.0, + "step": 111 + }, + { + "epoch": 0.2554161915621437, + "grad_norm": 10.875, + "learning_rate": 2.522727272727273e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.812217116355896, + "num_tokens": 11994518.0, + "step": 112 + }, + { + "epoch": 0.2576966932725199, + "grad_norm": 10.4375, + "learning_rate": 2.5454545454545456e-06, + "loss": 1.009, + "mean_token_accuracy": 0.8142502456903458, + "num_tokens": 12101625.0, + "step": 113 + }, + { + "epoch": 0.25997719498289623, + "grad_norm": 9.5625, + "learning_rate": 2.5681818181818187e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.8154999315738678, + "num_tokens": 12208599.0, + "step": 114 + }, + { + "epoch": 0.26225769669327254, + "grad_norm": 9.0625, + "learning_rate": 2.590909090909091e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.8178321123123169, + "num_tokens": 12315919.0, + "step": 115 + }, + { + "epoch": 0.2645381984036488, + "grad_norm": 8.25, + "learning_rate": 2.6136363636363637e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.8180135637521744, + "num_tokens": 12422670.0, + "step": 116 + }, + { + "epoch": 0.2668187001140251, + "grad_norm": 7.625, + "learning_rate": 2.6363636363636364e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.8282536566257477, + "num_tokens": 12529416.0, + "step": 117 + }, + { + "epoch": 0.2690992018244014, + "grad_norm": 13.1875, + "learning_rate": 2.6590909090909095e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.8260611891746521, + "num_tokens": 12636798.0, + "step": 118 + }, + { + "epoch": 0.27137970353477764, + "grad_norm": 6.9375, + "learning_rate": 2.6818181818181822e-06, + "loss": 0.896, + "mean_token_accuracy": 0.8283491432666779, + "num_tokens": 12743702.0, + "step": 119 + }, + { + "epoch": 0.27366020524515394, + "grad_norm": 6.6875, + "learning_rate": 2.7045454545454545e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.8232417404651642, + "num_tokens": 12851550.0, + "step": 120 + }, + { + "epoch": 0.2759407069555302, + "grad_norm": 6.0625, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.8310881853103638, + "num_tokens": 12958606.0, + "step": 121 + }, + { + "epoch": 0.2782212086659065, + "grad_norm": 5.21875, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.8362232446670532, + "num_tokens": 13065521.0, + "step": 122 + }, + { + "epoch": 0.2805017103762828, + "grad_norm": 5.15625, + "learning_rate": 2.772727272727273e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.8403038382530212, + "num_tokens": 13172067.0, + "step": 123 + }, + { + "epoch": 0.28278221208665905, + "grad_norm": 4.75, + "learning_rate": 2.7954545454545458e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.8477163910865784, + "num_tokens": 13279427.0, + "step": 124 + }, + { + "epoch": 0.28506271379703535, + "grad_norm": 4.78125, + "learning_rate": 2.818181818181818e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.8398231416940689, + "num_tokens": 13386332.0, + "step": 125 + }, + { + "epoch": 0.28734321550741165, + "grad_norm": 4.625, + "learning_rate": 2.8409090909090916e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.8417368233203888, + "num_tokens": 13493485.0, + "step": 126 + }, + { + "epoch": 0.2896237172177879, + "grad_norm": 4.40625, + "learning_rate": 2.863636363636364e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.8364972919225693, + "num_tokens": 13600526.0, + "step": 127 + }, + { + "epoch": 0.2919042189281642, + "grad_norm": 4.21875, + "learning_rate": 2.8863636363636366e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.8411660343408585, + "num_tokens": 13707335.0, + "step": 128 + }, + { + "epoch": 0.29418472063854045, + "grad_norm": 3.9375, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.840514287352562, + "num_tokens": 13814123.0, + "step": 129 + }, + { + "epoch": 0.29646522234891676, + "grad_norm": 3.890625, + "learning_rate": 2.931818181818182e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.8397897183895111, + "num_tokens": 13921444.0, + "step": 130 + }, + { + "epoch": 0.29874572405929306, + "grad_norm": 3.84375, + "learning_rate": 2.954545454545455e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.8480268269777298, + "num_tokens": 14028739.0, + "step": 131 + }, + { + "epoch": 0.3010262257696693, + "grad_norm": 4.15625, + "learning_rate": 2.9772727272727274e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.8492945581674576, + "num_tokens": 14136015.0, + "step": 132 + }, + { + "epoch": 0.3033067274800456, + "grad_norm": 4.46875, + "learning_rate": 3e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.8442358523607254, + "num_tokens": 14243016.0, + "step": 133 + }, + { + "epoch": 0.3055872291904219, + "grad_norm": 4.34375, + "learning_rate": 3.0227272727272728e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.8423920571804047, + "num_tokens": 14350319.0, + "step": 134 + }, + { + "epoch": 0.30786773090079816, + "grad_norm": 3.609375, + "learning_rate": 3.045454545454546e-06, + "loss": 0.767, + "mean_token_accuracy": 0.845877543091774, + "num_tokens": 14457699.0, + "step": 135 + }, + { + "epoch": 0.31014823261117447, + "grad_norm": 3.390625, + "learning_rate": 3.0681818181818186e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.8514658808708191, + "num_tokens": 14565228.0, + "step": 136 + }, + { + "epoch": 0.3124287343215507, + "grad_norm": 3.71875, + "learning_rate": 3.090909090909091e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.8524320423603058, + "num_tokens": 14671853.0, + "step": 137 + }, + { + "epoch": 0.314709236031927, + "grad_norm": 3.625, + "learning_rate": 3.1136363636363636e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.8469192087650299, + "num_tokens": 14778717.0, + "step": 138 + }, + { + "epoch": 0.3169897377423033, + "grad_norm": 4.03125, + "learning_rate": 3.1363636363636367e-06, + "loss": 0.751, + "mean_token_accuracy": 0.847698837518692, + "num_tokens": 14885625.0, + "step": 139 + }, + { + "epoch": 0.31927023945267957, + "grad_norm": 3.359375, + "learning_rate": 3.1590909090909094e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.8514856845140457, + "num_tokens": 14992824.0, + "step": 140 + }, + { + "epoch": 0.3215507411630559, + "grad_norm": 4.28125, + "learning_rate": 3.181818181818182e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.8546594232320786, + "num_tokens": 15100514.0, + "step": 141 + }, + { + "epoch": 0.3238312428734322, + "grad_norm": 6.03125, + "learning_rate": 3.204545454545455e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.8488231599330902, + "num_tokens": 15208020.0, + "step": 142 + }, + { + "epoch": 0.3261117445838084, + "grad_norm": 4.875, + "learning_rate": 3.227272727272728e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.8457832187414169, + "num_tokens": 15314965.0, + "step": 143 + }, + { + "epoch": 0.32839224629418473, + "grad_norm": 3.5625, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.8558014631271362, + "num_tokens": 15421585.0, + "step": 144 + }, + { + "epoch": 0.330672748004561, + "grad_norm": 3.109375, + "learning_rate": 3.272727272727273e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.8494018763303757, + "num_tokens": 15528412.0, + "step": 145 + }, + { + "epoch": 0.3329532497149373, + "grad_norm": 3.21875, + "learning_rate": 3.2954545454545456e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.8540607988834381, + "num_tokens": 15635689.0, + "step": 146 + }, + { + "epoch": 0.3352337514253136, + "grad_norm": 3.28125, + "learning_rate": 3.3181818181818188e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.856657013297081, + "num_tokens": 15742801.0, + "step": 147 + }, + { + "epoch": 0.33751425313568983, + "grad_norm": 3.953125, + "learning_rate": 3.3409090909090915e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.8510304987430573, + "num_tokens": 15850125.0, + "step": 148 + }, + { + "epoch": 0.33979475484606614, + "grad_norm": 3.625, + "learning_rate": 3.3636363636363637e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.84682796895504, + "num_tokens": 15956807.0, + "step": 149 + }, + { + "epoch": 0.34207525655644244, + "grad_norm": 3.578125, + "learning_rate": 3.3863636363636364e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.85152368247509, + "num_tokens": 16063797.0, + "step": 150 + }, + { + "epoch": 0.3443557582668187, + "grad_norm": 3.53125, + "learning_rate": 3.409090909090909e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.8609962910413742, + "num_tokens": 16170407.0, + "step": 151 + }, + { + "epoch": 0.346636259977195, + "grad_norm": 3.484375, + "learning_rate": 3.4318181818181823e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.8567307740449905, + "num_tokens": 16276735.0, + "step": 152 + }, + { + "epoch": 0.34891676168757124, + "grad_norm": 3.375, + "learning_rate": 3.454545454545455e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.8543938845396042, + "num_tokens": 16383665.0, + "step": 153 + }, + { + "epoch": 0.35119726339794755, + "grad_norm": 3.984375, + "learning_rate": 3.4772727272727277e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.8494099676609039, + "num_tokens": 16490654.0, + "step": 154 + }, + { + "epoch": 0.35347776510832385, + "grad_norm": 3.53125, + "learning_rate": 3.5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.8554576933383942, + "num_tokens": 16597928.0, + "step": 155 + }, + { + "epoch": 0.3557582668187001, + "grad_norm": 3.4375, + "learning_rate": 3.522727272727273e-06, + "loss": 0.666, + "mean_token_accuracy": 0.8644505739212036, + "num_tokens": 16705070.0, + "step": 156 + }, + { + "epoch": 0.3580387685290764, + "grad_norm": 4.6875, + "learning_rate": 3.5454545454545458e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.8530746251344681, + "num_tokens": 16812219.0, + "step": 157 + }, + { + "epoch": 0.3603192702394527, + "grad_norm": 3.125, + "learning_rate": 3.5681818181818185e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.8598745912313461, + "num_tokens": 16919765.0, + "step": 158 + }, + { + "epoch": 0.36259977194982895, + "grad_norm": 2.84375, + "learning_rate": 3.590909090909091e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.8543113619089127, + "num_tokens": 17026571.0, + "step": 159 + }, + { + "epoch": 0.36488027366020526, + "grad_norm": 3.265625, + "learning_rate": 3.6136363636363643e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.8556502610445023, + "num_tokens": 17133437.0, + "step": 160 + }, + { + "epoch": 0.3671607753705815, + "grad_norm": 2.578125, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.686, + "mean_token_accuracy": 0.859677642583847, + "num_tokens": 17240745.0, + "step": 161 + }, + { + "epoch": 0.3694412770809578, + "grad_norm": 2.453125, + "learning_rate": 3.6590909090909093e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.8606048673391342, + "num_tokens": 17347700.0, + "step": 162 + }, + { + "epoch": 0.3717217787913341, + "grad_norm": 3.015625, + "learning_rate": 3.681818181818182e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.8614284843206406, + "num_tokens": 17455182.0, + "step": 163 + }, + { + "epoch": 0.37400228050171036, + "grad_norm": 2.671875, + "learning_rate": 3.704545454545455e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.8534423410892487, + "num_tokens": 17562182.0, + "step": 164 + }, + { + "epoch": 0.37628278221208666, + "grad_norm": 2.46875, + "learning_rate": 3.727272727272728e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.855252206325531, + "num_tokens": 17668986.0, + "step": 165 + }, + { + "epoch": 0.37856328392246297, + "grad_norm": 2.625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.8632187992334366, + "num_tokens": 17776052.0, + "step": 166 + }, + { + "epoch": 0.3808437856328392, + "grad_norm": 2.65625, + "learning_rate": 3.772727272727273e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.854138657450676, + "num_tokens": 17883630.0, + "step": 167 + }, + { + "epoch": 0.3831242873432155, + "grad_norm": 2.546875, + "learning_rate": 3.7954545454545455e-06, + "loss": 0.6734, + "mean_token_accuracy": 0.8595366477966309, + "num_tokens": 17990666.0, + "step": 168 + }, + { + "epoch": 0.38540478905359177, + "grad_norm": 2.890625, + "learning_rate": 3.818181818181819e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.8539352118968964, + "num_tokens": 18097695.0, + "step": 169 + }, + { + "epoch": 0.38768529076396807, + "grad_norm": 2.515625, + "learning_rate": 3.840909090909091e-06, + "loss": 0.691, + "mean_token_accuracy": 0.8536562919616699, + "num_tokens": 18204503.0, + "step": 170 + }, + { + "epoch": 0.3899657924743444, + "grad_norm": 2.46875, + "learning_rate": 3.863636363636364e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.8611757457256317, + "num_tokens": 18311196.0, + "step": 171 + }, + { + "epoch": 0.3922462941847206, + "grad_norm": 3.125, + "learning_rate": 3.886363636363637e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.8503219485282898, + "num_tokens": 18417786.0, + "step": 172 + }, + { + "epoch": 0.3945267958950969, + "grad_norm": 2.0625, + "learning_rate": 3.90909090909091e-06, + "loss": 0.6665, + "mean_token_accuracy": 0.8595822900533676, + "num_tokens": 18525225.0, + "step": 173 + }, + { + "epoch": 0.39680729760547323, + "grad_norm": 2.984375, + "learning_rate": 3.931818181818182e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.8568900525569916, + "num_tokens": 18632231.0, + "step": 174 + }, + { + "epoch": 0.3990877993158495, + "grad_norm": 2.3125, + "learning_rate": 3.954545454545454e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.8539032638072968, + "num_tokens": 18739610.0, + "step": 175 + }, + { + "epoch": 0.4013683010262258, + "grad_norm": 2.6875, + "learning_rate": 3.9772727272727275e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.8602974861860275, + "num_tokens": 18846794.0, + "step": 176 + }, + { + "epoch": 0.40364880273660203, + "grad_norm": 2.484375, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.8603723049163818, + "num_tokens": 18953693.0, + "step": 177 + }, + { + "epoch": 0.40592930444697833, + "grad_norm": 3.265625, + "learning_rate": 4.022727272727273e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.8616438657045364, + "num_tokens": 19060250.0, + "step": 178 + }, + { + "epoch": 0.40820980615735464, + "grad_norm": 2.71875, + "learning_rate": 4.045454545454546e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.8587404191493988, + "num_tokens": 19167003.0, + "step": 179 + }, + { + "epoch": 0.4104903078677309, + "grad_norm": 2.265625, + "learning_rate": 4.068181818181818e-06, + "loss": 0.665, + "mean_token_accuracy": 0.8621865957975388, + "num_tokens": 19274555.0, + "step": 180 + }, + { + "epoch": 0.4127708095781072, + "grad_norm": 2.921875, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.8582023978233337, + "num_tokens": 19381948.0, + "step": 181 + }, + { + "epoch": 0.4150513112884835, + "grad_norm": 3.171875, + "learning_rate": 4.113636363636364e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.8636541664600372, + "num_tokens": 19489531.0, + "step": 182 + }, + { + "epoch": 0.41733181299885974, + "grad_norm": 2.53125, + "learning_rate": 4.136363636363637e-06, + "loss": 0.6553, + "mean_token_accuracy": 0.8614503294229507, + "num_tokens": 19596788.0, + "step": 183 + }, + { + "epoch": 0.41961231470923605, + "grad_norm": 2.375, + "learning_rate": 4.159090909090909e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.865259125828743, + "num_tokens": 19703381.0, + "step": 184 + }, + { + "epoch": 0.4218928164196123, + "grad_norm": 2.515625, + "learning_rate": 4.181818181818182e-06, + "loss": 0.675, + "mean_token_accuracy": 0.8583291471004486, + "num_tokens": 19810512.0, + "step": 185 + }, + { + "epoch": 0.4241733181299886, + "grad_norm": 2.21875, + "learning_rate": 4.204545454545455e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.8560606837272644, + "num_tokens": 19917996.0, + "step": 186 + }, + { + "epoch": 0.4264538198403649, + "grad_norm": 3.015625, + "learning_rate": 4.227272727272728e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.8619653731584549, + "num_tokens": 20025261.0, + "step": 187 + }, + { + "epoch": 0.42873432155074115, + "grad_norm": 2.546875, + "learning_rate": 4.25e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.8561718165874481, + "num_tokens": 20131720.0, + "step": 188 + }, + { + "epoch": 0.43101482326111745, + "grad_norm": 2.921875, + "learning_rate": 4.272727272727273e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.8626906722784042, + "num_tokens": 20238797.0, + "step": 189 + }, + { + "epoch": 0.43329532497149376, + "grad_norm": 2.421875, + "learning_rate": 4.295454545454546e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.8580966591835022, + "num_tokens": 20345319.0, + "step": 190 + }, + { + "epoch": 0.43557582668187, + "grad_norm": 2.71875, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.8572024405002594, + "num_tokens": 20452421.0, + "step": 191 + }, + { + "epoch": 0.4378563283922463, + "grad_norm": 4.0, + "learning_rate": 4.340909090909091e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.862261489033699, + "num_tokens": 20559771.0, + "step": 192 + }, + { + "epoch": 0.44013683010262256, + "grad_norm": 2.75, + "learning_rate": 4.363636363636364e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8651402294635773, + "num_tokens": 20667009.0, + "step": 193 + }, + { + "epoch": 0.44241733181299886, + "grad_norm": 2.84375, + "learning_rate": 4.386363636363637e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.8569408059120178, + "num_tokens": 20774389.0, + "step": 194 + }, + { + "epoch": 0.44469783352337516, + "grad_norm": 4.34375, + "learning_rate": 4.409090909090909e-06, + "loss": 0.667, + "mean_token_accuracy": 0.8601485341787338, + "num_tokens": 20881703.0, + "step": 195 + }, + { + "epoch": 0.4469783352337514, + "grad_norm": 3.875, + "learning_rate": 4.4318181818181824e-06, + "loss": 0.656, + "mean_token_accuracy": 0.8644906729459763, + "num_tokens": 20988710.0, + "step": 196 + }, + { + "epoch": 0.4492588369441277, + "grad_norm": 2.859375, + "learning_rate": 4.454545454545455e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.8626122176647186, + "num_tokens": 21095340.0, + "step": 197 + }, + { + "epoch": 0.45153933865450396, + "grad_norm": 3.40625, + "learning_rate": 4.477272727272728e-06, + "loss": 0.6547, + "mean_token_accuracy": 0.8651628345251083, + "num_tokens": 21202493.0, + "step": 198 + }, + { + "epoch": 0.45381984036488027, + "grad_norm": 3.4375, + "learning_rate": 4.5e-06, + "loss": 0.6607, + "mean_token_accuracy": 0.8622200191020966, + "num_tokens": 21309591.0, + "step": 199 + }, + { + "epoch": 0.45610034207525657, + "grad_norm": 2.46875, + "learning_rate": 4.522727272727273e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.8680934458971024, + "num_tokens": 21417007.0, + "step": 200 + }, + { + "epoch": 0.4583808437856328, + "grad_norm": 3.53125, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.8650902211666107, + "num_tokens": 21524653.0, + "step": 201 + }, + { + "epoch": 0.4606613454960091, + "grad_norm": 4.0625, + "learning_rate": 4.568181818181819e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.8640954792499542, + "num_tokens": 21631985.0, + "step": 202 + }, + { + "epoch": 0.4629418472063854, + "grad_norm": 2.4375, + "learning_rate": 4.590909090909092e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.8592006415128708, + "num_tokens": 21738904.0, + "step": 203 + }, + { + "epoch": 0.4652223489167617, + "grad_norm": 2.65625, + "learning_rate": 4.613636363636364e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.8670714944601059, + "num_tokens": 21846587.0, + "step": 204 + }, + { + "epoch": 0.467502850627138, + "grad_norm": 2.890625, + "learning_rate": 4.636363636363636e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.861112967133522, + "num_tokens": 21953786.0, + "step": 205 + }, + { + "epoch": 0.4697833523375142, + "grad_norm": 4.125, + "learning_rate": 4.6590909090909095e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.860434964299202, + "num_tokens": 22061456.0, + "step": 206 + }, + { + "epoch": 0.47206385404789053, + "grad_norm": 2.34375, + "learning_rate": 4.681818181818183e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.8619724959135056, + "num_tokens": 22169315.0, + "step": 207 + }, + { + "epoch": 0.47434435575826683, + "grad_norm": 2.6875, + "learning_rate": 4.704545454545455e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.8640032708644867, + "num_tokens": 22276762.0, + "step": 208 + }, + { + "epoch": 0.4766248574686431, + "grad_norm": 2.703125, + "learning_rate": 4.727272727272728e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.8638423681259155, + "num_tokens": 22383321.0, + "step": 209 + }, + { + "epoch": 0.4789053591790194, + "grad_norm": 2.03125, + "learning_rate": 4.75e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.8663541227579117, + "num_tokens": 22490163.0, + "step": 210 + }, + { + "epoch": 0.4811858608893957, + "grad_norm": 1.8984375, + "learning_rate": 4.772727272727273e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8662288784980774, + "num_tokens": 22596745.0, + "step": 211 + }, + { + "epoch": 0.48346636259977194, + "grad_norm": 3.296875, + "learning_rate": 4.795454545454546e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.8609149158000946, + "num_tokens": 22704241.0, + "step": 212 + }, + { + "epoch": 0.48574686431014824, + "grad_norm": 2.296875, + "learning_rate": 4.818181818181819e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.8637297302484512, + "num_tokens": 22810936.0, + "step": 213 + }, + { + "epoch": 0.4880273660205245, + "grad_norm": 2.21875, + "learning_rate": 4.840909090909091e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.8652000278234482, + "num_tokens": 22917635.0, + "step": 214 + }, + { + "epoch": 0.4903078677309008, + "grad_norm": 3.921875, + "learning_rate": 4.863636363636364e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.8612215965986252, + "num_tokens": 23024940.0, + "step": 215 + }, + { + "epoch": 0.4925883694412771, + "grad_norm": 3.796875, + "learning_rate": 4.8863636363636365e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.8561449646949768, + "num_tokens": 23132260.0, + "step": 216 + }, + { + "epoch": 0.49486887115165334, + "grad_norm": 2.734375, + "learning_rate": 4.90909090909091e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.8678651452064514, + "num_tokens": 23239311.0, + "step": 217 + }, + { + "epoch": 0.49714937286202965, + "grad_norm": 2.6875, + "learning_rate": 4.931818181818182e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.8589539378881454, + "num_tokens": 23346340.0, + "step": 218 + }, + { + "epoch": 0.49942987457240595, + "grad_norm": 2.15625, + "learning_rate": 4.954545454545455e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.8672493249177933, + "num_tokens": 23454269.0, + "step": 219 + }, + { + "epoch": 0.5017103762827823, + "grad_norm": 2.53125, + "learning_rate": 4.977272727272728e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.8612665235996246, + "num_tokens": 23560711.0, + "step": 220 + }, + { + "epoch": 0.5017103762827823, + "eval_loss": 0.6530081033706665, + "eval_mean_token_accuracy": 0.8651766348701013, + "eval_num_tokens": 23560711.0, + "eval_runtime": 58.5845, + "eval_samples_per_second": 143.126, + "eval_steps_per_second": 4.489, + "step": 220 + }, + { + "epoch": 0.5039908779931584, + "grad_norm": 2.296875, + "learning_rate": 5e-06, + "loss": 0.62, + "mean_token_accuracy": 0.8713521808385849, + "num_tokens": 23668025.0, + "step": 221 + }, + { + "epoch": 0.5062713797035348, + "grad_norm": 1.890625, + "learning_rate": 4.999999290524132e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.8714602589607239, + "num_tokens": 23775006.0, + "step": 222 + }, + { + "epoch": 0.508551881413911, + "grad_norm": 2.703125, + "learning_rate": 4.999997162096932e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.8643509894609451, + "num_tokens": 23882355.0, + "step": 223 + }, + { + "epoch": 0.5108323831242874, + "grad_norm": 2.9375, + "learning_rate": 4.999993614719606e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.8682572692632675, + "num_tokens": 23989600.0, + "step": 224 + }, + { + "epoch": 0.5131128848346637, + "grad_norm": 2.140625, + "learning_rate": 4.999988648394169e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8725505918264389, + "num_tokens": 24097243.0, + "step": 225 + }, + { + "epoch": 0.5153933865450399, + "grad_norm": 2.890625, + "learning_rate": 4.99998226312344e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.8634347319602966, + "num_tokens": 24203953.0, + "step": 226 + }, + { + "epoch": 0.5176738882554162, + "grad_norm": 2.546875, + "learning_rate": 4.999974458911041e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8699511885643005, + "num_tokens": 24310863.0, + "step": 227 + }, + { + "epoch": 0.5199543899657925, + "grad_norm": 2.484375, + "learning_rate": 4.999965235761404e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.8652396500110626, + "num_tokens": 24417755.0, + "step": 228 + }, + { + "epoch": 0.5222348916761688, + "grad_norm": 2.34375, + "learning_rate": 4.999954593679762e-06, + "loss": 0.659, + "mean_token_accuracy": 0.864411011338234, + "num_tokens": 24524785.0, + "step": 229 + }, + { + "epoch": 0.5245153933865451, + "grad_norm": 4.3125, + "learning_rate": 4.999942532672157e-06, + "loss": 0.688, + "mean_token_accuracy": 0.854931503534317, + "num_tokens": 24631291.0, + "step": 230 + }, + { + "epoch": 0.5267958950969214, + "grad_norm": 2.859375, + "learning_rate": 4.999929052745434e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.8636079728603363, + "num_tokens": 24738568.0, + "step": 231 + }, + { + "epoch": 0.5290763968072976, + "grad_norm": 3.953125, + "learning_rate": 4.999914153907243e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.8641977459192276, + "num_tokens": 24845646.0, + "step": 232 + }, + { + "epoch": 0.5313568985176739, + "grad_norm": 2.1875, + "learning_rate": 4.999897836166041e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.8661595731973648, + "num_tokens": 24952336.0, + "step": 233 + }, + { + "epoch": 0.5336374002280502, + "grad_norm": 2.6875, + "learning_rate": 4.999880099531089e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.8643601685762405, + "num_tokens": 25058478.0, + "step": 234 + }, + { + "epoch": 0.5359179019384265, + "grad_norm": 2.890625, + "learning_rate": 4.999860944012455e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.8663338273763657, + "num_tokens": 25165512.0, + "step": 235 + }, + { + "epoch": 0.5381984036488028, + "grad_norm": 2.390625, + "learning_rate": 4.999840369621011e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.8652057200670242, + "num_tokens": 25272671.0, + "step": 236 + }, + { + "epoch": 0.540478905359179, + "grad_norm": 2.734375, + "learning_rate": 4.999818376368435e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8729881942272186, + "num_tokens": 25379431.0, + "step": 237 + }, + { + "epoch": 0.5427594070695553, + "grad_norm": 2.15625, + "learning_rate": 4.999794964267208e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.8688799887895584, + "num_tokens": 25486822.0, + "step": 238 + }, + { + "epoch": 0.5450399087799316, + "grad_norm": 2.015625, + "learning_rate": 4.9997701333306215e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.8656931668519974, + "num_tokens": 25594363.0, + "step": 239 + }, + { + "epoch": 0.5473204104903079, + "grad_norm": 5.34375, + "learning_rate": 4.999743883572766e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.8659048229455948, + "num_tokens": 25701619.0, + "step": 240 + }, + { + "epoch": 0.5496009122006842, + "grad_norm": 4.125, + "learning_rate": 4.999716215008542e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.866451695561409, + "num_tokens": 25808261.0, + "step": 241 + }, + { + "epoch": 0.5518814139110604, + "grad_norm": 2.359375, + "learning_rate": 4.999687127653654e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.8627986013889313, + "num_tokens": 25915402.0, + "step": 242 + }, + { + "epoch": 0.5541619156214367, + "grad_norm": 2.421875, + "learning_rate": 4.99965662152461e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.8666711002588272, + "num_tokens": 26023048.0, + "step": 243 + }, + { + "epoch": 0.556442417331813, + "grad_norm": 2.046875, + "learning_rate": 4.999624696638725e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.869884267449379, + "num_tokens": 26130137.0, + "step": 244 + }, + { + "epoch": 0.5587229190421893, + "grad_norm": 2.765625, + "learning_rate": 4.999591353014119e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.8630170971155167, + "num_tokens": 26237107.0, + "step": 245 + }, + { + "epoch": 0.5610034207525656, + "grad_norm": 3.09375, + "learning_rate": 4.999556590669718e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.8699584752321243, + "num_tokens": 26344577.0, + "step": 246 + }, + { + "epoch": 0.5632839224629419, + "grad_norm": 2.34375, + "learning_rate": 4.999520409625253e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.8654352575540543, + "num_tokens": 26451583.0, + "step": 247 + }, + { + "epoch": 0.5655644241733181, + "grad_norm": 2.21875, + "learning_rate": 4.999482809901257e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.8594978898763657, + "num_tokens": 26558350.0, + "step": 248 + }, + { + "epoch": 0.5678449258836944, + "grad_norm": 2.265625, + "learning_rate": 4.999443791519074e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.8702575117349625, + "num_tokens": 26665344.0, + "step": 249 + }, + { + "epoch": 0.5701254275940707, + "grad_norm": 3.03125, + "learning_rate": 4.999403354500847e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.8670401573181152, + "num_tokens": 26772368.0, + "step": 250 + }, + { + "epoch": 0.572405929304447, + "grad_norm": 2.1875, + "learning_rate": 4.99936149886953e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.8696161508560181, + "num_tokens": 26879565.0, + "step": 251 + }, + { + "epoch": 0.5746864310148233, + "grad_norm": 2.734375, + "learning_rate": 4.999318224648878e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.8677998781204224, + "num_tokens": 26986810.0, + "step": 252 + }, + { + "epoch": 0.5769669327251995, + "grad_norm": 2.53125, + "learning_rate": 4.999273531863453e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.8713762909173965, + "num_tokens": 27094179.0, + "step": 253 + }, + { + "epoch": 0.5792474344355758, + "grad_norm": 2.65625, + "learning_rate": 4.999227420538622e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.8648567199707031, + "num_tokens": 27201502.0, + "step": 254 + }, + { + "epoch": 0.5815279361459521, + "grad_norm": 2.25, + "learning_rate": 4.999179890700555e-06, + "loss": 0.663, + "mean_token_accuracy": 0.8646685779094696, + "num_tokens": 27308101.0, + "step": 255 + }, + { + "epoch": 0.5838084378563284, + "grad_norm": 4.6875, + "learning_rate": 4.999130942376232e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.8694456964731216, + "num_tokens": 27414923.0, + "step": 256 + }, + { + "epoch": 0.5860889395667047, + "grad_norm": 2.609375, + "learning_rate": 4.999080575593433e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.8683536648750305, + "num_tokens": 27521506.0, + "step": 257 + }, + { + "epoch": 0.5883694412770809, + "grad_norm": 2.546875, + "learning_rate": 4.999028790380746e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.8628116995096207, + "num_tokens": 27628384.0, + "step": 258 + }, + { + "epoch": 0.5906499429874572, + "grad_norm": 2.203125, + "learning_rate": 4.9989755867675635e-06, + "loss": 0.638, + "mean_token_accuracy": 0.8686887472867966, + "num_tokens": 27735457.0, + "step": 259 + }, + { + "epoch": 0.5929304446978335, + "grad_norm": 2.234375, + "learning_rate": 4.998920964784082e-06, + "loss": 0.6394, + "mean_token_accuracy": 0.868531346321106, + "num_tokens": 27842440.0, + "step": 260 + }, + { + "epoch": 0.5952109464082098, + "grad_norm": 2.578125, + "learning_rate": 4.998864924461305e-06, + "loss": 0.6169, + "mean_token_accuracy": 0.8760952204465866, + "num_tokens": 27949150.0, + "step": 261 + }, + { + "epoch": 0.5974914481185861, + "grad_norm": 2.75, + "learning_rate": 4.998807465831039e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.8662375062704086, + "num_tokens": 28056218.0, + "step": 262 + }, + { + "epoch": 0.5997719498289624, + "grad_norm": 3.0, + "learning_rate": 4.998748588925897e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.8636495620012283, + "num_tokens": 28163194.0, + "step": 263 + }, + { + "epoch": 0.6020524515393386, + "grad_norm": 2.25, + "learning_rate": 4.998688293779297e-06, + "loss": 0.621, + "mean_token_accuracy": 0.8721827417612076, + "num_tokens": 28270611.0, + "step": 264 + }, + { + "epoch": 0.6043329532497149, + "grad_norm": 4.4375, + "learning_rate": 4.998626580425459e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.8628446161746979, + "num_tokens": 28377681.0, + "step": 265 + }, + { + "epoch": 0.6066134549600912, + "grad_norm": 2.78125, + "learning_rate": 4.998563448899413e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8687665909528732, + "num_tokens": 28484960.0, + "step": 266 + }, + { + "epoch": 0.6088939566704675, + "grad_norm": 2.96875, + "learning_rate": 4.998498899236989e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.8647327572107315, + "num_tokens": 28592263.0, + "step": 267 + }, + { + "epoch": 0.6111744583808438, + "grad_norm": 2.4375, + "learning_rate": 4.998432931474825e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.8670562505722046, + "num_tokens": 28699389.0, + "step": 268 + }, + { + "epoch": 0.61345496009122, + "grad_norm": 2.765625, + "learning_rate": 4.998365545650365e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.8697399348020554, + "num_tokens": 28807311.0, + "step": 269 + }, + { + "epoch": 0.6157354618015963, + "grad_norm": 6.0, + "learning_rate": 4.998296741801852e-06, + "loss": 0.657, + "mean_token_accuracy": 0.8628436028957367, + "num_tokens": 28913892.0, + "step": 270 + }, + { + "epoch": 0.6180159635119726, + "grad_norm": 3.40625, + "learning_rate": 4.998226519968341e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8716912418603897, + "num_tokens": 29020775.0, + "step": 271 + }, + { + "epoch": 0.6202964652223489, + "grad_norm": 4.0, + "learning_rate": 4.998154880189688e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.8669329136610031, + "num_tokens": 29127842.0, + "step": 272 + }, + { + "epoch": 0.6225769669327252, + "grad_norm": 2.484375, + "learning_rate": 4.998081822506552e-06, + "loss": 0.643, + "mean_token_accuracy": 0.8676059246063232, + "num_tokens": 29235156.0, + "step": 273 + }, + { + "epoch": 0.6248574686431014, + "grad_norm": 3.734375, + "learning_rate": 4.998007346960402e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.8684564977884293, + "num_tokens": 29342694.0, + "step": 274 + }, + { + "epoch": 0.6271379703534777, + "grad_norm": 2.375, + "learning_rate": 4.997931453593507e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.8661504536867142, + "num_tokens": 29449934.0, + "step": 275 + }, + { + "epoch": 0.629418472063854, + "grad_norm": 2.71875, + "learning_rate": 4.997854142448944e-06, + "loss": 0.632, + "mean_token_accuracy": 0.8716614693403244, + "num_tokens": 29557007.0, + "step": 276 + }, + { + "epoch": 0.6316989737742303, + "grad_norm": 2.65625, + "learning_rate": 4.997775413570593e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.8713774085044861, + "num_tokens": 29664526.0, + "step": 277 + }, + { + "epoch": 0.6339794754846066, + "grad_norm": 3.28125, + "learning_rate": 4.997695267003139e-06, + "loss": 0.6231, + "mean_token_accuracy": 0.8719380050897598, + "num_tokens": 29771396.0, + "step": 278 + }, + { + "epoch": 0.636259977194983, + "grad_norm": 2.5, + "learning_rate": 4.99761370279207e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.8700685054063797, + "num_tokens": 29877736.0, + "step": 279 + }, + { + "epoch": 0.6385404789053591, + "grad_norm": 3.703125, + "learning_rate": 4.997530720983682e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.8674991726875305, + "num_tokens": 29985140.0, + "step": 280 + }, + { + "epoch": 0.6408209806157354, + "grad_norm": 3.34375, + "learning_rate": 4.9974463216250735e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.8660477548837662, + "num_tokens": 30092099.0, + "step": 281 + }, + { + "epoch": 0.6431014823261118, + "grad_norm": 3.421875, + "learning_rate": 4.997360504764148e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.8705120533704758, + "num_tokens": 30198927.0, + "step": 282 + }, + { + "epoch": 0.645381984036488, + "grad_norm": 2.25, + "learning_rate": 4.997273270449614e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8727722465991974, + "num_tokens": 30306090.0, + "step": 283 + }, + { + "epoch": 0.6476624857468644, + "grad_norm": 2.875, + "learning_rate": 4.997184618730983e-06, + "loss": 0.648, + "mean_token_accuracy": 0.8658933788537979, + "num_tokens": 30413636.0, + "step": 284 + }, + { + "epoch": 0.6499429874572406, + "grad_norm": 2.5625, + "learning_rate": 4.997094549658572e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.8701870143413544, + "num_tokens": 30521647.0, + "step": 285 + }, + { + "epoch": 0.6522234891676169, + "grad_norm": 2.46875, + "learning_rate": 4.997003063283503e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.8652370274066925, + "num_tokens": 30628370.0, + "step": 286 + }, + { + "epoch": 0.6545039908779932, + "grad_norm": 3.296875, + "learning_rate": 4.996910159657703e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8690200746059418, + "num_tokens": 30735820.0, + "step": 287 + }, + { + "epoch": 0.6567844925883695, + "grad_norm": 2.34375, + "learning_rate": 4.996815838833899e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.8680445849895477, + "num_tokens": 30843566.0, + "step": 288 + }, + { + "epoch": 0.6590649942987458, + "grad_norm": 2.90625, + "learning_rate": 4.99672010086563e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8737114071846008, + "num_tokens": 30951346.0, + "step": 289 + }, + { + "epoch": 0.661345496009122, + "grad_norm": 3.125, + "learning_rate": 4.996622945807231e-06, + "loss": 0.6135, + "mean_token_accuracy": 0.8741404414176941, + "num_tokens": 31058663.0, + "step": 290 + }, + { + "epoch": 0.6636259977194983, + "grad_norm": 2.078125, + "learning_rate": 4.996524373713848e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.8712268769741058, + "num_tokens": 31166704.0, + "step": 291 + }, + { + "epoch": 0.6659064994298746, + "grad_norm": 5.09375, + "learning_rate": 4.996424384641428e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.874211773276329, + "num_tokens": 31273610.0, + "step": 292 + }, + { + "epoch": 0.6681870011402509, + "grad_norm": 6.875, + "learning_rate": 4.996322978646722e-06, + "loss": 0.6088, + "mean_token_accuracy": 0.8734241724014282, + "num_tokens": 31380662.0, + "step": 293 + }, + { + "epoch": 0.6704675028506272, + "grad_norm": 2.796875, + "learning_rate": 4.996220155787287e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.8677922487258911, + "num_tokens": 31487775.0, + "step": 294 + }, + { + "epoch": 0.6727480045610034, + "grad_norm": 2.640625, + "learning_rate": 4.996115916121483e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.8704153895378113, + "num_tokens": 31595052.0, + "step": 295 + }, + { + "epoch": 0.6750285062713797, + "grad_norm": 5.28125, + "learning_rate": 4.996010259708475e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.8663373440504074, + "num_tokens": 31702051.0, + "step": 296 + }, + { + "epoch": 0.677309007981756, + "grad_norm": 2.75, + "learning_rate": 4.99590318660823e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.87331423163414, + "num_tokens": 31809437.0, + "step": 297 + }, + { + "epoch": 0.6795895096921323, + "grad_norm": 2.796875, + "learning_rate": 4.9957946968815215e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.8677855134010315, + "num_tokens": 31916405.0, + "step": 298 + }, + { + "epoch": 0.6818700114025086, + "grad_norm": 3.3125, + "learning_rate": 4.995684790589927e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8677714616060257, + "num_tokens": 32023457.0, + "step": 299 + }, + { + "epoch": 0.6841505131128849, + "grad_norm": 3.1875, + "learning_rate": 4.995573467795825e-06, + "loss": 0.6213, + "mean_token_accuracy": 0.8719353079795837, + "num_tokens": 32130881.0, + "step": 300 + }, + { + "epoch": 0.6864310148232611, + "grad_norm": 4.3125, + "learning_rate": 4.995460728562403e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.8694345206022263, + "num_tokens": 32237937.0, + "step": 301 + }, + { + "epoch": 0.6887115165336374, + "grad_norm": 3.046875, + "learning_rate": 4.9953465729536475e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.8696666061878204, + "num_tokens": 32345103.0, + "step": 302 + }, + { + "epoch": 0.6909920182440137, + "grad_norm": 2.265625, + "learning_rate": 4.995231001034352e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.868248924612999, + "num_tokens": 32451683.0, + "step": 303 + }, + { + "epoch": 0.69327251995439, + "grad_norm": 2.140625, + "learning_rate": 4.995114012870112e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.8688762336969376, + "num_tokens": 32558767.0, + "step": 304 + }, + { + "epoch": 0.6955530216647663, + "grad_norm": 3.6875, + "learning_rate": 4.99499560852733e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8722851425409317, + "num_tokens": 32665998.0, + "step": 305 + }, + { + "epoch": 0.6978335233751425, + "grad_norm": 5.0, + "learning_rate": 4.994875788073207e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.8682472556829453, + "num_tokens": 32773798.0, + "step": 306 + }, + { + "epoch": 0.7001140250855188, + "grad_norm": 3.5, + "learning_rate": 4.994754551575752e-06, + "loss": 0.65, + "mean_token_accuracy": 0.8666471391916275, + "num_tokens": 32880403.0, + "step": 307 + }, + { + "epoch": 0.7023945267958951, + "grad_norm": 2.328125, + "learning_rate": 4.994631899103777e-06, + "loss": 0.6419, + "mean_token_accuracy": 0.867155522108078, + "num_tokens": 32987552.0, + "step": 308 + }, + { + "epoch": 0.7046750285062714, + "grad_norm": 2.375, + "learning_rate": 4.9945078307268974e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8653433471918106, + "num_tokens": 33094230.0, + "step": 309 + }, + { + "epoch": 0.7069555302166477, + "grad_norm": 4.84375, + "learning_rate": 4.994382346515531e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.8708781599998474, + "num_tokens": 33201782.0, + "step": 310 + }, + { + "epoch": 0.7092360319270239, + "grad_norm": 4.15625, + "learning_rate": 4.9942554465409e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.8667383790016174, + "num_tokens": 33308197.0, + "step": 311 + }, + { + "epoch": 0.7115165336374002, + "grad_norm": 3.296875, + "learning_rate": 4.994127130875032e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8729591369628906, + "num_tokens": 33415348.0, + "step": 312 + }, + { + "epoch": 0.7137970353477765, + "grad_norm": 2.078125, + "learning_rate": 4.993997399590755e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8707993477582932, + "num_tokens": 33522331.0, + "step": 313 + }, + { + "epoch": 0.7160775370581528, + "grad_norm": 4.25, + "learning_rate": 4.993866252761702e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.8653054535388947, + "num_tokens": 33629329.0, + "step": 314 + }, + { + "epoch": 0.7183580387685291, + "grad_norm": 4.1875, + "learning_rate": 4.993733690462311e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.8681600391864777, + "num_tokens": 33736412.0, + "step": 315 + }, + { + "epoch": 0.7206385404789054, + "grad_norm": 1.953125, + "learning_rate": 4.99359971276782e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.8719700872898102, + "num_tokens": 33843463.0, + "step": 316 + }, + { + "epoch": 0.7229190421892816, + "grad_norm": 1.8203125, + "learning_rate": 4.993464319754273e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8726183176040649, + "num_tokens": 33950234.0, + "step": 317 + }, + { + "epoch": 0.7251995438996579, + "grad_norm": 2.9375, + "learning_rate": 4.993327511498516e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8715388774871826, + "num_tokens": 34057495.0, + "step": 318 + }, + { + "epoch": 0.7274800456100342, + "grad_norm": 2.4375, + "learning_rate": 4.9931892880782e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8697730153799057, + "num_tokens": 34164527.0, + "step": 319 + }, + { + "epoch": 0.7297605473204105, + "grad_norm": 2.75, + "learning_rate": 4.993049649571775e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.8663023114204407, + "num_tokens": 34271517.0, + "step": 320 + }, + { + "epoch": 0.7320410490307868, + "grad_norm": 4.1875, + "learning_rate": 4.992908596058501e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8702614009380341, + "num_tokens": 34378878.0, + "step": 321 + }, + { + "epoch": 0.734321550741163, + "grad_norm": 2.640625, + "learning_rate": 4.992766127618434e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.8703981339931488, + "num_tokens": 34485783.0, + "step": 322 + }, + { + "epoch": 0.7366020524515393, + "grad_norm": 2.5, + "learning_rate": 4.992622244332439e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8647294193506241, + "num_tokens": 34592416.0, + "step": 323 + }, + { + "epoch": 0.7388825541619156, + "grad_norm": 1.953125, + "learning_rate": 4.992476946282179e-06, + "loss": 0.6313, + "mean_token_accuracy": 0.8711864650249481, + "num_tokens": 34699693.0, + "step": 324 + }, + { + "epoch": 0.7411630558722919, + "grad_norm": 3.46875, + "learning_rate": 4.992330233550124e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.8703418523073196, + "num_tokens": 34806837.0, + "step": 325 + }, + { + "epoch": 0.7434435575826682, + "grad_norm": 3.140625, + "learning_rate": 4.9921821062195445e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.8661866039037704, + "num_tokens": 34913354.0, + "step": 326 + }, + { + "epoch": 0.7457240592930444, + "grad_norm": 3.015625, + "learning_rate": 4.9920325643745145e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.8691374510526657, + "num_tokens": 35020432.0, + "step": 327 + }, + { + "epoch": 0.7480045610034207, + "grad_norm": 6.3125, + "learning_rate": 4.991881608099912e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.8629052639007568, + "num_tokens": 35127877.0, + "step": 328 + }, + { + "epoch": 0.750285062713797, + "grad_norm": 4.5, + "learning_rate": 4.991729237481417e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8679073601961136, + "num_tokens": 35234847.0, + "step": 329 + }, + { + "epoch": 0.7525655644241733, + "grad_norm": 2.21875, + "learning_rate": 4.991575452605511e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.8713287711143494, + "num_tokens": 35342056.0, + "step": 330 + }, + { + "epoch": 0.7548460661345496, + "grad_norm": 2.6875, + "learning_rate": 4.9914202535594795e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8670705258846283, + "num_tokens": 35448907.0, + "step": 331 + }, + { + "epoch": 0.7571265678449259, + "grad_norm": 3.890625, + "learning_rate": 4.991263640431411e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.8704476803541183, + "num_tokens": 35556170.0, + "step": 332 + }, + { + "epoch": 0.7594070695553021, + "grad_norm": 5.5, + "learning_rate": 4.9911056133101965e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.8712238371372223, + "num_tokens": 35662971.0, + "step": 333 + }, + { + "epoch": 0.7616875712656784, + "grad_norm": 3.171875, + "learning_rate": 4.990946172285528e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.8710373938083649, + "num_tokens": 35769814.0, + "step": 334 + }, + { + "epoch": 0.7639680729760547, + "grad_norm": 2.140625, + "learning_rate": 4.990785317447901e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8746102750301361, + "num_tokens": 35877019.0, + "step": 335 + }, + { + "epoch": 0.766248574686431, + "grad_norm": 2.953125, + "learning_rate": 4.990623048888615e-06, + "loss": 0.627, + "mean_token_accuracy": 0.8735091537237167, + "num_tokens": 35983784.0, + "step": 336 + }, + { + "epoch": 0.7685290763968073, + "grad_norm": 3.9375, + "learning_rate": 4.9904593666997704e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.8641745299100876, + "num_tokens": 36090599.0, + "step": 337 + }, + { + "epoch": 0.7708095781071835, + "grad_norm": 3.875, + "learning_rate": 4.990294270974268e-06, + "loss": 0.6328, + "mean_token_accuracy": 0.8708094358444214, + "num_tokens": 36197751.0, + "step": 338 + }, + { + "epoch": 0.7730900798175598, + "grad_norm": 2.078125, + "learning_rate": 4.990127761805816e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.8706918656826019, + "num_tokens": 36304049.0, + "step": 339 + }, + { + "epoch": 0.7753705815279361, + "grad_norm": 1.9375, + "learning_rate": 4.989959839288919e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8699767887592316, + "num_tokens": 36411843.0, + "step": 340 + }, + { + "epoch": 0.7776510832383124, + "grad_norm": 2.765625, + "learning_rate": 4.989790503518888e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8728293031454086, + "num_tokens": 36518890.0, + "step": 341 + }, + { + "epoch": 0.7799315849486887, + "grad_norm": 5.875, + "learning_rate": 4.9896197545918345e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.8697379231452942, + "num_tokens": 36626348.0, + "step": 342 + }, + { + "epoch": 0.7822120866590649, + "grad_norm": 1.8984375, + "learning_rate": 4.989447592604673e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8770763874053955, + "num_tokens": 36733926.0, + "step": 343 + }, + { + "epoch": 0.7844925883694412, + "grad_norm": 3.515625, + "learning_rate": 4.989274017655117e-06, + "loss": 0.6, + "mean_token_accuracy": 0.8766891658306122, + "num_tokens": 36841051.0, + "step": 344 + }, + { + "epoch": 0.7867730900798175, + "grad_norm": 3.078125, + "learning_rate": 4.989099029841687e-06, + "loss": 0.6305, + "mean_token_accuracy": 0.8703635185956955, + "num_tokens": 36947674.0, + "step": 345 + }, + { + "epoch": 0.7890535917901939, + "grad_norm": 2.109375, + "learning_rate": 4.988922629263701e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.87026646733284, + "num_tokens": 37054412.0, + "step": 346 + }, + { + "epoch": 0.7913340935005702, + "grad_norm": 3.0625, + "learning_rate": 4.988744816021283e-06, + "loss": 0.64, + "mean_token_accuracy": 0.8698112666606903, + "num_tokens": 37162022.0, + "step": 347 + }, + { + "epoch": 0.7936145952109465, + "grad_norm": 2.15625, + "learning_rate": 4.988565590215352e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8709748089313507, + "num_tokens": 37269040.0, + "step": 348 + }, + { + "epoch": 0.7958950969213227, + "grad_norm": 4.15625, + "learning_rate": 4.9883849519476364e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8744599372148514, + "num_tokens": 37376445.0, + "step": 349 + }, + { + "epoch": 0.798175598631699, + "grad_norm": 2.984375, + "learning_rate": 4.988202901320663e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.8723867684602737, + "num_tokens": 37483781.0, + "step": 350 + }, + { + "epoch": 0.8004561003420753, + "grad_norm": 5.0625, + "learning_rate": 4.988019438437759e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.8646156787872314, + "num_tokens": 37590388.0, + "step": 351 + }, + { + "epoch": 0.8027366020524516, + "grad_norm": 3.359375, + "learning_rate": 4.987834563403055e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8731140941381454, + "num_tokens": 37698197.0, + "step": 352 + }, + { + "epoch": 0.8050171037628279, + "grad_norm": 3.765625, + "learning_rate": 4.987648276321482e-06, + "loss": 0.658, + "mean_token_accuracy": 0.8623346835374832, + "num_tokens": 37804721.0, + "step": 353 + }, + { + "epoch": 0.8072976054732041, + "grad_norm": 5.15625, + "learning_rate": 4.987460577298774e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.8700351715087891, + "num_tokens": 37912089.0, + "step": 354 + }, + { + "epoch": 0.8095781071835804, + "grad_norm": 5.53125, + "learning_rate": 4.9872714664414635e-06, + "loss": 0.6288, + "mean_token_accuracy": 0.8672404289245605, + "num_tokens": 38018353.0, + "step": 355 + }, + { + "epoch": 0.8118586088939567, + "grad_norm": 5.0, + "learning_rate": 4.987080943856887e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.8685039430856705, + "num_tokens": 38124920.0, + "step": 356 + }, + { + "epoch": 0.814139110604333, + "grad_norm": 2.546875, + "learning_rate": 4.986889009653183e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8747462034225464, + "num_tokens": 38232424.0, + "step": 357 + }, + { + "epoch": 0.8164196123147093, + "grad_norm": 4.5625, + "learning_rate": 4.986695663939288e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.8709569126367569, + "num_tokens": 38339566.0, + "step": 358 + }, + { + "epoch": 0.8187001140250855, + "grad_norm": 4.21875, + "learning_rate": 4.986500906824942e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.8750324100255966, + "num_tokens": 38446417.0, + "step": 359 + }, + { + "epoch": 0.8209806157354618, + "grad_norm": 5.5, + "learning_rate": 4.986304738420684e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.8724595308303833, + "num_tokens": 38553490.0, + "step": 360 + }, + { + "epoch": 0.8232611174458381, + "grad_norm": 3.28125, + "learning_rate": 4.9861071588378565e-06, + "loss": 0.644, + "mean_token_accuracy": 0.8690180629491806, + "num_tokens": 38660341.0, + "step": 361 + }, + { + "epoch": 0.8255416191562144, + "grad_norm": 2.203125, + "learning_rate": 4.985908168188602e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8723935782909393, + "num_tokens": 38767171.0, + "step": 362 + }, + { + "epoch": 0.8278221208665907, + "grad_norm": 2.390625, + "learning_rate": 4.985707766585865e-06, + "loss": 0.6186, + "mean_token_accuracy": 0.8713973164558411, + "num_tokens": 38873913.0, + "step": 363 + }, + { + "epoch": 0.830102622576967, + "grad_norm": 2.75, + "learning_rate": 4.985505954143387e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.8741555064916611, + "num_tokens": 38981023.0, + "step": 364 + }, + { + "epoch": 0.8323831242873432, + "grad_norm": 1.8359375, + "learning_rate": 4.985302730975713e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.8745421171188354, + "num_tokens": 39088181.0, + "step": 365 + }, + { + "epoch": 0.8346636259977195, + "grad_norm": 2.65625, + "learning_rate": 4.9850980971981914e-06, + "loss": 0.6419, + "mean_token_accuracy": 0.870025172829628, + "num_tokens": 39195055.0, + "step": 366 + }, + { + "epoch": 0.8369441277080958, + "grad_norm": 3.375, + "learning_rate": 4.984892052926965e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.8648830950260162, + "num_tokens": 39302023.0, + "step": 367 + }, + { + "epoch": 0.8392246294184721, + "grad_norm": 1.9609375, + "learning_rate": 4.984684598278982e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.8697508275508881, + "num_tokens": 39408616.0, + "step": 368 + }, + { + "epoch": 0.8415051311288484, + "grad_norm": 2.265625, + "learning_rate": 4.984475733371991e-06, + "loss": 0.649, + "mean_token_accuracy": 0.8658408671617508, + "num_tokens": 39515948.0, + "step": 369 + }, + { + "epoch": 0.8437856328392246, + "grad_norm": 3.265625, + "learning_rate": 4.984265458324538e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.8648363202810287, + "num_tokens": 39622628.0, + "step": 370 + }, + { + "epoch": 0.8460661345496009, + "grad_norm": 3.078125, + "learning_rate": 4.984053773255971e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.8734622299671173, + "num_tokens": 39730107.0, + "step": 371 + }, + { + "epoch": 0.8483466362599772, + "grad_norm": 2.609375, + "learning_rate": 4.9838406782864394e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8723733127117157, + "num_tokens": 39837288.0, + "step": 372 + }, + { + "epoch": 0.8506271379703535, + "grad_norm": 1.9296875, + "learning_rate": 4.983626173536891e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.8771011680364609, + "num_tokens": 39944365.0, + "step": 373 + }, + { + "epoch": 0.8529076396807298, + "grad_norm": 2.140625, + "learning_rate": 4.983410259129075e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.8640440553426743, + "num_tokens": 40051377.0, + "step": 374 + }, + { + "epoch": 0.855188141391106, + "grad_norm": 2.203125, + "learning_rate": 4.983192935185539e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8728392422199249, + "num_tokens": 40158071.0, + "step": 375 + }, + { + "epoch": 0.8574686431014823, + "grad_norm": 2.03125, + "learning_rate": 4.9829742018296335e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8718508183956146, + "num_tokens": 40265163.0, + "step": 376 + }, + { + "epoch": 0.8597491448118586, + "grad_norm": 2.046875, + "learning_rate": 4.9827540591855064e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.8694735020399094, + "num_tokens": 40372053.0, + "step": 377 + }, + { + "epoch": 0.8620296465222349, + "grad_norm": 2.8125, + "learning_rate": 4.9825325073781075e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.8715538680553436, + "num_tokens": 40479008.0, + "step": 378 + }, + { + "epoch": 0.8643101482326112, + "grad_norm": 3.15625, + "learning_rate": 4.982309546533184e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.866651251912117, + "num_tokens": 40585719.0, + "step": 379 + }, + { + "epoch": 0.8665906499429875, + "grad_norm": 2.078125, + "learning_rate": 4.982085176777285e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8743686825037003, + "num_tokens": 40692870.0, + "step": 380 + }, + { + "epoch": 0.8688711516533637, + "grad_norm": 4.75, + "learning_rate": 4.981859398237758e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.8692844212055206, + "num_tokens": 40800059.0, + "step": 381 + }, + { + "epoch": 0.87115165336374, + "grad_norm": 3.171875, + "learning_rate": 4.9816322110427505e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.8697078227996826, + "num_tokens": 40907542.0, + "step": 382 + }, + { + "epoch": 0.8734321550741163, + "grad_norm": 3.734375, + "learning_rate": 4.98140361532121e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.871175691485405, + "num_tokens": 41014351.0, + "step": 383 + }, + { + "epoch": 0.8757126567844926, + "grad_norm": 4.21875, + "learning_rate": 4.981173611202883e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.8670124560594559, + "num_tokens": 41121154.0, + "step": 384 + }, + { + "epoch": 0.8779931584948689, + "grad_norm": 3.234375, + "learning_rate": 4.980942198818315e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8680609911680222, + "num_tokens": 41228488.0, + "step": 385 + }, + { + "epoch": 0.8802736602052451, + "grad_norm": 4.96875, + "learning_rate": 4.980709378298851e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.8670986741781235, + "num_tokens": 41335487.0, + "step": 386 + }, + { + "epoch": 0.8825541619156214, + "grad_norm": 2.328125, + "learning_rate": 4.980475149776636e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8722383230924606, + "num_tokens": 41442292.0, + "step": 387 + }, + { + "epoch": 0.8848346636259977, + "grad_norm": 2.421875, + "learning_rate": 4.980239513384614e-06, + "loss": 0.6291, + "mean_token_accuracy": 0.8699522018432617, + "num_tokens": 41549340.0, + "step": 388 + }, + { + "epoch": 0.887115165336374, + "grad_norm": 7.5, + "learning_rate": 4.980002469256527e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8692153990268707, + "num_tokens": 41656335.0, + "step": 389 + }, + { + "epoch": 0.8893956670467503, + "grad_norm": 3.171875, + "learning_rate": 4.979764017526916e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8701380044221878, + "num_tokens": 41763377.0, + "step": 390 + }, + { + "epoch": 0.8916761687571265, + "grad_norm": 2.09375, + "learning_rate": 4.979524158331123e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8717087209224701, + "num_tokens": 41870195.0, + "step": 391 + }, + { + "epoch": 0.8939566704675028, + "grad_norm": 2.09375, + "learning_rate": 4.979282891805287e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8729406297206879, + "num_tokens": 41977586.0, + "step": 392 + }, + { + "epoch": 0.8962371721778791, + "grad_norm": 2.078125, + "learning_rate": 4.979040218086345e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.8731931746006012, + "num_tokens": 42084265.0, + "step": 393 + }, + { + "epoch": 0.8985176738882554, + "grad_norm": 2.109375, + "learning_rate": 4.978796137312036e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.8713531494140625, + "num_tokens": 42191215.0, + "step": 394 + }, + { + "epoch": 0.9007981755986317, + "grad_norm": 2.84375, + "learning_rate": 4.978550649620894e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.868993267416954, + "num_tokens": 42298587.0, + "step": 395 + }, + { + "epoch": 0.9030786773090079, + "grad_norm": 2.140625, + "learning_rate": 4.978303755152254e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8749285191297531, + "num_tokens": 42406175.0, + "step": 396 + }, + { + "epoch": 0.9053591790193842, + "grad_norm": 4.3125, + "learning_rate": 4.978055454046247e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8721934705972672, + "num_tokens": 42513138.0, + "step": 397 + }, + { + "epoch": 0.9076396807297605, + "grad_norm": 2.953125, + "learning_rate": 4.977805746443807e-06, + "loss": 0.6093, + "mean_token_accuracy": 0.87620410323143, + "num_tokens": 42620098.0, + "step": 398 + }, + { + "epoch": 0.9099201824401368, + "grad_norm": 2.0625, + "learning_rate": 4.9775546324866596e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8835956156253815, + "num_tokens": 42727074.0, + "step": 399 + }, + { + "epoch": 0.9122006841505131, + "grad_norm": 4.28125, + "learning_rate": 4.977302112317334e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.8638210296630859, + "num_tokens": 42834015.0, + "step": 400 + }, + { + "epoch": 0.9144811858608894, + "grad_norm": 5.78125, + "learning_rate": 4.977048186079155e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8687157332897186, + "num_tokens": 42940807.0, + "step": 401 + }, + { + "epoch": 0.9167616875712656, + "grad_norm": 3.328125, + "learning_rate": 4.976792853916248e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.8687217086553574, + "num_tokens": 43047379.0, + "step": 402 + }, + { + "epoch": 0.9190421892816419, + "grad_norm": 4.625, + "learning_rate": 4.9765361159735335e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.8750667423009872, + "num_tokens": 43154795.0, + "step": 403 + }, + { + "epoch": 0.9213226909920182, + "grad_norm": 6.25, + "learning_rate": 4.97627797239673e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8691485226154327, + "num_tokens": 43261640.0, + "step": 404 + }, + { + "epoch": 0.9236031927023945, + "grad_norm": 2.96875, + "learning_rate": 4.976018423332357e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.875212773680687, + "num_tokens": 43368543.0, + "step": 405 + }, + { + "epoch": 0.9258836944127709, + "grad_norm": 2.890625, + "learning_rate": 4.975757468927727e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.8764694184064865, + "num_tokens": 43475828.0, + "step": 406 + }, + { + "epoch": 0.928164196123147, + "grad_norm": 4.1875, + "learning_rate": 4.975495109330954e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.8676637560129166, + "num_tokens": 43582569.0, + "step": 407 + }, + { + "epoch": 0.9304446978335233, + "grad_norm": 8.4375, + "learning_rate": 4.97523134469095e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.8702896982431412, + "num_tokens": 43689655.0, + "step": 408 + }, + { + "epoch": 0.9327251995438997, + "grad_norm": 2.875, + "learning_rate": 4.97496617515742e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8774754852056503, + "num_tokens": 43797009.0, + "step": 409 + }, + { + "epoch": 0.935005701254276, + "grad_norm": 2.40625, + "learning_rate": 4.974699600880869e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8728543817996979, + "num_tokens": 43904821.0, + "step": 410 + }, + { + "epoch": 0.9372862029646523, + "grad_norm": 3.734375, + "learning_rate": 4.974431622012601e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.866395503282547, + "num_tokens": 44011564.0, + "step": 411 + }, + { + "epoch": 0.9395667046750285, + "grad_norm": 2.015625, + "learning_rate": 4.974162238704716e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8790825754404068, + "num_tokens": 44118685.0, + "step": 412 + }, + { + "epoch": 0.9418472063854048, + "grad_norm": 4.4375, + "learning_rate": 4.973891451110109e-06, + "loss": 0.6196, + "mean_token_accuracy": 0.8710049241781235, + "num_tokens": 44225866.0, + "step": 413 + }, + { + "epoch": 0.9441277080957811, + "grad_norm": 5.15625, + "learning_rate": 4.973619259382475e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8691826015710831, + "num_tokens": 44333305.0, + "step": 414 + }, + { + "epoch": 0.9464082098061574, + "grad_norm": 3.46875, + "learning_rate": 4.973345663676305e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.8764727264642715, + "num_tokens": 44439926.0, + "step": 415 + }, + { + "epoch": 0.9486887115165337, + "grad_norm": 3.078125, + "learning_rate": 4.973070664146885e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.871231347322464, + "num_tokens": 44547096.0, + "step": 416 + }, + { + "epoch": 0.95096921322691, + "grad_norm": 4.25, + "learning_rate": 4.972794260950301e-06, + "loss": 0.6118, + "mean_token_accuracy": 0.8720909953117371, + "num_tokens": 44654153.0, + "step": 417 + }, + { + "epoch": 0.9532497149372862, + "grad_norm": 5.15625, + "learning_rate": 4.972516454243433e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.8684164136648178, + "num_tokens": 44761805.0, + "step": 418 + }, + { + "epoch": 0.9555302166476625, + "grad_norm": 6.25, + "learning_rate": 4.972237244183961e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8743065893650055, + "num_tokens": 44869180.0, + "step": 419 + }, + { + "epoch": 0.9578107183580388, + "grad_norm": 4.3125, + "learning_rate": 4.971956630930356e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.8736863434314728, + "num_tokens": 44976595.0, + "step": 420 + }, + { + "epoch": 0.9600912200684151, + "grad_norm": 2.21875, + "learning_rate": 4.971674614641891e-06, + "loss": 0.618, + "mean_token_accuracy": 0.8725884109735489, + "num_tokens": 45083791.0, + "step": 421 + }, + { + "epoch": 0.9623717217787914, + "grad_norm": 3.609375, + "learning_rate": 4.971391195478632e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.875871405005455, + "num_tokens": 45190655.0, + "step": 422 + }, + { + "epoch": 0.9646522234891676, + "grad_norm": 4.0, + "learning_rate": 4.971106373601443e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.8689168095588684, + "num_tokens": 45297243.0, + "step": 423 + }, + { + "epoch": 0.9669327251995439, + "grad_norm": 3.5625, + "learning_rate": 4.9708201491719825e-06, + "loss": 0.623, + "mean_token_accuracy": 0.8707065731287003, + "num_tokens": 45404013.0, + "step": 424 + }, + { + "epoch": 0.9692132269099202, + "grad_norm": 2.46875, + "learning_rate": 4.9705325223527055e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.8755741715431213, + "num_tokens": 45511141.0, + "step": 425 + }, + { + "epoch": 0.9714937286202965, + "grad_norm": 2.578125, + "learning_rate": 4.970243493306865e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.8742416203022003, + "num_tokens": 45618184.0, + "step": 426 + }, + { + "epoch": 0.9737742303306728, + "grad_norm": 3.78125, + "learning_rate": 4.969953062198508e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.8716663718223572, + "num_tokens": 45725266.0, + "step": 427 + }, + { + "epoch": 0.976054732041049, + "grad_norm": 2.359375, + "learning_rate": 4.969661229192477e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8720242828130722, + "num_tokens": 45832262.0, + "step": 428 + }, + { + "epoch": 0.9783352337514253, + "grad_norm": 2.1875, + "learning_rate": 4.969367994454412e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8772017359733582, + "num_tokens": 45939904.0, + "step": 429 + }, + { + "epoch": 0.9806157354618016, + "grad_norm": 1.9609375, + "learning_rate": 4.9690733581507445e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8746745586395264, + "num_tokens": 46046844.0, + "step": 430 + }, + { + "epoch": 0.9828962371721779, + "grad_norm": 1.84375, + "learning_rate": 4.968777320448707e-06, + "loss": 0.622, + "mean_token_accuracy": 0.8727656751871109, + "num_tokens": 46154121.0, + "step": 431 + }, + { + "epoch": 0.9851767388825542, + "grad_norm": 2.953125, + "learning_rate": 4.9684798815163235e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.8737929463386536, + "num_tokens": 46260970.0, + "step": 432 + }, + { + "epoch": 0.9874572405929305, + "grad_norm": 2.390625, + "learning_rate": 4.968181041522416e-06, + "loss": 0.6273, + "mean_token_accuracy": 0.8705646842718124, + "num_tokens": 46368205.0, + "step": 433 + }, + { + "epoch": 0.9897377423033067, + "grad_norm": 2.171875, + "learning_rate": 4.967880800636599e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.8682654201984406, + "num_tokens": 46475197.0, + "step": 434 + }, + { + "epoch": 0.992018244013683, + "grad_norm": 2.140625, + "learning_rate": 4.967579159029284e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.8732243329286575, + "num_tokens": 46582041.0, + "step": 435 + }, + { + "epoch": 0.9942987457240593, + "grad_norm": 3.28125, + "learning_rate": 4.9672761168716766e-06, + "loss": 0.6324, + "mean_token_accuracy": 0.8705407828092575, + "num_tokens": 46688998.0, + "step": 436 + }, + { + "epoch": 0.9965792474344356, + "grad_norm": 2.953125, + "learning_rate": 4.966971674335778e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8775283247232437, + "num_tokens": 46795830.0, + "step": 437 + }, + { + "epoch": 0.9988597491448119, + "grad_norm": 2.125, + "learning_rate": 4.966665831594383e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8748981207609177, + "num_tokens": 46903068.0, + "step": 438 + }, + { + "epoch": 1.0, + "grad_norm": 7.34375, + "learning_rate": 4.966358588821084e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.8653521537780762, + "num_tokens": 46942232.0, + "step": 439 + }, + { + "epoch": 1.0022805017103762, + "grad_norm": 2.0, + "learning_rate": 4.966049946190265e-06, + "loss": 0.624, + "mean_token_accuracy": 0.8737205117940903, + "num_tokens": 47049296.0, + "step": 440 + }, + { + "epoch": 1.0022805017103762, + "eval_loss": 0.6221891045570374, + "eval_mean_token_accuracy": 0.8722115945453426, + "eval_num_tokens": 47049296.0, + "eval_runtime": 58.6597, + "eval_samples_per_second": 142.943, + "eval_steps_per_second": 4.483, + "step": 440 + }, + { + "epoch": 1.0045610034207526, + "grad_norm": 2.015625, + "learning_rate": 4.9657399038771045e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8777357935905457, + "num_tokens": 47156210.0, + "step": 441 + }, + { + "epoch": 1.0068415051311288, + "grad_norm": 2.4375, + "learning_rate": 4.965428462057578e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.8698296397924423, + "num_tokens": 47263285.0, + "step": 442 + }, + { + "epoch": 1.0091220068415052, + "grad_norm": 4.34375, + "learning_rate": 4.965115620908453e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8760122805833817, + "num_tokens": 47370395.0, + "step": 443 + }, + { + "epoch": 1.0114025085518814, + "grad_norm": 3.65625, + "learning_rate": 4.964801380607293e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.8716463297605515, + "num_tokens": 47477476.0, + "step": 444 + }, + { + "epoch": 1.0136830102622576, + "grad_norm": 3.25, + "learning_rate": 4.964485741332453e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8730912059545517, + "num_tokens": 47585481.0, + "step": 445 + }, + { + "epoch": 1.015963511972634, + "grad_norm": 2.5625, + "learning_rate": 4.964168703263086e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8801993578672409, + "num_tokens": 47693090.0, + "step": 446 + }, + { + "epoch": 1.0182440136830102, + "grad_norm": 5.8125, + "learning_rate": 4.963850266579136e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8687308132648468, + "num_tokens": 47800542.0, + "step": 447 + }, + { + "epoch": 1.0205245153933866, + "grad_norm": 5.0, + "learning_rate": 4.963530431461341e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8741424828767776, + "num_tokens": 47907379.0, + "step": 448 + }, + { + "epoch": 1.0228050171037628, + "grad_norm": 5.15625, + "learning_rate": 4.963209198091232e-06, + "loss": 0.6295, + "mean_token_accuracy": 0.8697365671396255, + "num_tokens": 48014994.0, + "step": 449 + }, + { + "epoch": 1.025085518814139, + "grad_norm": 2.203125, + "learning_rate": 4.962886566651138e-06, + "loss": 0.6177, + "mean_token_accuracy": 0.873694971203804, + "num_tokens": 48122545.0, + "step": 450 + }, + { + "epoch": 1.0273660205245154, + "grad_norm": 8.5625, + "learning_rate": 4.962562537324176e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8789347857236862, + "num_tokens": 48230071.0, + "step": 451 + }, + { + "epoch": 1.0296465222348916, + "grad_norm": 4.5625, + "learning_rate": 4.96223711029426e-06, + "loss": 0.641, + "mean_token_accuracy": 0.8675385862588882, + "num_tokens": 48336830.0, + "step": 452 + }, + { + "epoch": 1.031927023945268, + "grad_norm": 5.90625, + "learning_rate": 4.961910285746094e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8765672594308853, + "num_tokens": 48443698.0, + "step": 453 + }, + { + "epoch": 1.0342075256556442, + "grad_norm": 2.6875, + "learning_rate": 4.9615820638651805e-06, + "loss": 0.642, + "mean_token_accuracy": 0.8699254840612411, + "num_tokens": 48550340.0, + "step": 454 + }, + { + "epoch": 1.0364880273660204, + "grad_norm": 5.71875, + "learning_rate": 4.961252444837809e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.8699039667844772, + "num_tokens": 48657031.0, + "step": 455 + }, + { + "epoch": 1.0387685290763968, + "grad_norm": 4.90625, + "learning_rate": 4.960921428851066e-06, + "loss": 0.6169, + "mean_token_accuracy": 0.8748660534620285, + "num_tokens": 48763597.0, + "step": 456 + }, + { + "epoch": 1.041049030786773, + "grad_norm": 5.0, + "learning_rate": 4.960589016092832e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8746305704116821, + "num_tokens": 48871423.0, + "step": 457 + }, + { + "epoch": 1.0433295324971494, + "grad_norm": 6.84375, + "learning_rate": 4.960255206751774e-06, + "loss": 0.627, + "mean_token_accuracy": 0.8704584836959839, + "num_tokens": 48978350.0, + "step": 458 + }, + { + "epoch": 1.0456100342075256, + "grad_norm": 3.515625, + "learning_rate": 4.959920001017358e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8744053989648819, + "num_tokens": 49085534.0, + "step": 459 + }, + { + "epoch": 1.047890535917902, + "grad_norm": 4.59375, + "learning_rate": 4.95958339907984e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.8731739073991776, + "num_tokens": 49192856.0, + "step": 460 + }, + { + "epoch": 1.0501710376282782, + "grad_norm": 9.1875, + "learning_rate": 4.959245401130269e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8677381575107574, + "num_tokens": 49299373.0, + "step": 461 + }, + { + "epoch": 1.0524515393386544, + "grad_norm": 6.5625, + "learning_rate": 4.958906007360487e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.876073032617569, + "num_tokens": 49406428.0, + "step": 462 + }, + { + "epoch": 1.0547320410490308, + "grad_norm": 3.96875, + "learning_rate": 4.958565217963125e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8712072819471359, + "num_tokens": 49513674.0, + "step": 463 + }, + { + "epoch": 1.057012542759407, + "grad_norm": 4.65625, + "learning_rate": 4.95822303313161e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8719818741083145, + "num_tokens": 49620690.0, + "step": 464 + }, + { + "epoch": 1.0592930444697835, + "grad_norm": 2.4375, + "learning_rate": 4.957879453060159e-06, + "loss": 0.607, + "mean_token_accuracy": 0.874313622713089, + "num_tokens": 49728319.0, + "step": 465 + }, + { + "epoch": 1.0615735461801596, + "grad_norm": 2.859375, + "learning_rate": 4.957534477943782e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8772328495979309, + "num_tokens": 49835825.0, + "step": 466 + }, + { + "epoch": 1.0638540478905358, + "grad_norm": 4.71875, + "learning_rate": 4.957188107978279e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8736355155706406, + "num_tokens": 49943303.0, + "step": 467 + }, + { + "epoch": 1.0661345496009123, + "grad_norm": 5.3125, + "learning_rate": 4.956840343360245e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8750191628932953, + "num_tokens": 50050828.0, + "step": 468 + }, + { + "epoch": 1.0684150513112884, + "grad_norm": 2.71875, + "learning_rate": 4.956491184287062e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8774718195199966, + "num_tokens": 50158185.0, + "step": 469 + }, + { + "epoch": 1.0706955530216649, + "grad_norm": 2.484375, + "learning_rate": 4.9561406309569084e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.8694217354059219, + "num_tokens": 50265385.0, + "step": 470 + }, + { + "epoch": 1.072976054732041, + "grad_norm": 2.78125, + "learning_rate": 4.955788683568749e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8717086911201477, + "num_tokens": 50372238.0, + "step": 471 + }, + { + "epoch": 1.0752565564424172, + "grad_norm": 3.625, + "learning_rate": 4.955435342322345e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8751126825809479, + "num_tokens": 50479115.0, + "step": 472 + }, + { + "epoch": 1.0775370581527937, + "grad_norm": 3.6875, + "learning_rate": 4.955080607418244e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.8753761649131775, + "num_tokens": 50586257.0, + "step": 473 + }, + { + "epoch": 1.0798175598631699, + "grad_norm": 2.421875, + "learning_rate": 4.954724479057788e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.8744408041238785, + "num_tokens": 50692536.0, + "step": 474 + }, + { + "epoch": 1.0820980615735463, + "grad_norm": 2.453125, + "learning_rate": 4.954366957443107e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8744495362043381, + "num_tokens": 50799101.0, + "step": 475 + }, + { + "epoch": 1.0843785632839225, + "grad_norm": 4.96875, + "learning_rate": 4.954008042777125e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.871866300702095, + "num_tokens": 50906128.0, + "step": 476 + }, + { + "epoch": 1.0866590649942987, + "grad_norm": 3.65625, + "learning_rate": 4.953647735263555e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8779451102018356, + "num_tokens": 51013166.0, + "step": 477 + }, + { + "epoch": 1.088939566704675, + "grad_norm": 3.40625, + "learning_rate": 4.953286035106898e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.8745466768741608, + "num_tokens": 51120460.0, + "step": 478 + }, + { + "epoch": 1.0912200684150513, + "grad_norm": 2.140625, + "learning_rate": 4.952922942512452e-06, + "loss": 0.6224, + "mean_token_accuracy": 0.8701803684234619, + "num_tokens": 51227376.0, + "step": 479 + }, + { + "epoch": 1.0935005701254277, + "grad_norm": 1.8515625, + "learning_rate": 4.9525584576862985e-06, + "loss": 0.623, + "mean_token_accuracy": 0.8713761270046234, + "num_tokens": 51334284.0, + "step": 480 + }, + { + "epoch": 1.0957810718358039, + "grad_norm": 3.71875, + "learning_rate": 4.952192580835313e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.8694994300603867, + "num_tokens": 51441531.0, + "step": 481 + }, + { + "epoch": 1.09806157354618, + "grad_norm": 4.1875, + "learning_rate": 4.9518253121671595e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8716080784797668, + "num_tokens": 51549055.0, + "step": 482 + }, + { + "epoch": 1.1003420752565565, + "grad_norm": 2.796875, + "learning_rate": 4.951456651890294e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.881346806883812, + "num_tokens": 51656798.0, + "step": 483 + }, + { + "epoch": 1.1026225769669327, + "grad_norm": 2.359375, + "learning_rate": 4.951086600213959e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.8716641664505005, + "num_tokens": 51763790.0, + "step": 484 + }, + { + "epoch": 1.104903078677309, + "grad_norm": 2.5625, + "learning_rate": 4.950715157348191e-06, + "loss": 0.6196, + "mean_token_accuracy": 0.8753807693719864, + "num_tokens": 51870653.0, + "step": 485 + }, + { + "epoch": 1.1071835803876853, + "grad_norm": 2.921875, + "learning_rate": 4.950342323503812e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.870696559548378, + "num_tokens": 51977455.0, + "step": 486 + }, + { + "epoch": 1.1094640820980617, + "grad_norm": 4.625, + "learning_rate": 4.949968098892436e-06, + "loss": 0.642, + "mean_token_accuracy": 0.8685554414987564, + "num_tokens": 52083822.0, + "step": 487 + }, + { + "epoch": 1.1117445838084379, + "grad_norm": 2.484375, + "learning_rate": 4.949592483726465e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8732410371303558, + "num_tokens": 52191177.0, + "step": 488 + }, + { + "epoch": 1.114025085518814, + "grad_norm": 2.140625, + "learning_rate": 4.949215478219092e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8758023232221603, + "num_tokens": 52297640.0, + "step": 489 + }, + { + "epoch": 1.1163055872291905, + "grad_norm": 2.0625, + "learning_rate": 4.948837082584298e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.8783663511276245, + "num_tokens": 52404875.0, + "step": 490 + }, + { + "epoch": 1.1185860889395667, + "grad_norm": 4.5625, + "learning_rate": 4.9484572970368516e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.8689621537923813, + "num_tokens": 52511463.0, + "step": 491 + }, + { + "epoch": 1.120866590649943, + "grad_norm": 3.234375, + "learning_rate": 4.948076121792313e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.8712314665317535, + "num_tokens": 52617733.0, + "step": 492 + }, + { + "epoch": 1.1231470923603193, + "grad_norm": 2.5625, + "learning_rate": 4.9476935570670294e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.8728629648685455, + "num_tokens": 52725104.0, + "step": 493 + }, + { + "epoch": 1.1254275940706955, + "grad_norm": 2.796875, + "learning_rate": 4.947309603078138e-06, + "loss": 0.646, + "mean_token_accuracy": 0.8683103322982788, + "num_tokens": 52832460.0, + "step": 494 + }, + { + "epoch": 1.127708095781072, + "grad_norm": 4.84375, + "learning_rate": 4.946924260043563e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.8687001317739487, + "num_tokens": 52939314.0, + "step": 495 + }, + { + "epoch": 1.129988597491448, + "grad_norm": 2.65625, + "learning_rate": 4.946537528182017e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8715783804655075, + "num_tokens": 53045997.0, + "step": 496 + }, + { + "epoch": 1.1322690992018245, + "grad_norm": 2.5, + "learning_rate": 4.946149407713002e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.8704388439655304, + "num_tokens": 53153160.0, + "step": 497 + }, + { + "epoch": 1.1345496009122007, + "grad_norm": 2.78125, + "learning_rate": 4.945759898856809e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8732296079397202, + "num_tokens": 53260206.0, + "step": 498 + }, + { + "epoch": 1.1368301026225769, + "grad_norm": 2.140625, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.8706572353839874, + "num_tokens": 53366657.0, + "step": 499 + }, + { + "epoch": 1.1391106043329533, + "grad_norm": 3.25, + "learning_rate": 4.944976716867984e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8699849843978882, + "num_tokens": 53473137.0, + "step": 500 + }, + { + "epoch": 1.1413911060433295, + "grad_norm": 2.09375, + "learning_rate": 4.944583044179871e-06, + "loss": 0.6214, + "mean_token_accuracy": 0.875198557972908, + "num_tokens": 53580438.0, + "step": 501 + }, + { + "epoch": 1.143671607753706, + "grad_norm": 2.421875, + "learning_rate": 4.944187983993617e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8774997144937515, + "num_tokens": 53687875.0, + "step": 502 + }, + { + "epoch": 1.145952109464082, + "grad_norm": 3.453125, + "learning_rate": 4.94379153653345e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.8719532340764999, + "num_tokens": 53794260.0, + "step": 503 + }, + { + "epoch": 1.1482326111744583, + "grad_norm": 3.890625, + "learning_rate": 4.9433937020243854e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.869365006685257, + "num_tokens": 53901213.0, + "step": 504 + }, + { + "epoch": 1.1505131128848347, + "grad_norm": 2.609375, + "learning_rate": 4.942994480692228e-06, + "loss": 0.6199, + "mean_token_accuracy": 0.8735174834728241, + "num_tokens": 54007865.0, + "step": 505 + }, + { + "epoch": 1.152793614595211, + "grad_norm": 3.3125, + "learning_rate": 4.942593872763566e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8790851831436157, + "num_tokens": 54114696.0, + "step": 506 + }, + { + "epoch": 1.1550741163055873, + "grad_norm": 2.984375, + "learning_rate": 4.9421918784657795e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.8767211586236954, + "num_tokens": 54221690.0, + "step": 507 + }, + { + "epoch": 1.1573546180159635, + "grad_norm": 3.390625, + "learning_rate": 4.94178849802703e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8764821738004684, + "num_tokens": 54328424.0, + "step": 508 + }, + { + "epoch": 1.1596351197263397, + "grad_norm": 2.40625, + "learning_rate": 4.9413837316762705e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8703786879777908, + "num_tokens": 54435061.0, + "step": 509 + }, + { + "epoch": 1.1619156214367161, + "grad_norm": 2.109375, + "learning_rate": 4.940977579643237e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8734538704156876, + "num_tokens": 54541917.0, + "step": 510 + }, + { + "epoch": 1.1641961231470923, + "grad_norm": 3.15625, + "learning_rate": 4.940570042158454e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8750556707382202, + "num_tokens": 54649030.0, + "step": 511 + }, + { + "epoch": 1.1664766248574687, + "grad_norm": 4.15625, + "learning_rate": 4.940161119453232e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8727173060178757, + "num_tokens": 54756245.0, + "step": 512 + }, + { + "epoch": 1.168757126567845, + "grad_norm": 4.53125, + "learning_rate": 4.939750811759668e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.8715848624706268, + "num_tokens": 54863315.0, + "step": 513 + }, + { + "epoch": 1.171037628278221, + "grad_norm": 2.46875, + "learning_rate": 4.939339119310645e-06, + "loss": 0.621, + "mean_token_accuracy": 0.8712294846773148, + "num_tokens": 54970714.0, + "step": 514 + }, + { + "epoch": 1.1733181299885975, + "grad_norm": 3.453125, + "learning_rate": 4.93892604233983e-06, + "loss": 0.6305, + "mean_token_accuracy": 0.8707058429718018, + "num_tokens": 55077726.0, + "step": 515 + }, + { + "epoch": 1.1755986316989737, + "grad_norm": 4.5625, + "learning_rate": 4.93851158108168e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8780573010444641, + "num_tokens": 55185327.0, + "step": 516 + }, + { + "epoch": 1.1778791334093501, + "grad_norm": 3.953125, + "learning_rate": 4.938095735771433e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.8706268519163132, + "num_tokens": 55292419.0, + "step": 517 + }, + { + "epoch": 1.1801596351197263, + "grad_norm": 1.8984375, + "learning_rate": 4.937678506645116e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.873799741268158, + "num_tokens": 55399684.0, + "step": 518 + }, + { + "epoch": 1.1824401368301025, + "grad_norm": 5.84375, + "learning_rate": 4.937259893939539e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.8741802275180817, + "num_tokens": 55507140.0, + "step": 519 + }, + { + "epoch": 1.184720638540479, + "grad_norm": 4.78125, + "learning_rate": 4.9368398978923e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8741926103830338, + "num_tokens": 55614345.0, + "step": 520 + }, + { + "epoch": 1.1870011402508551, + "grad_norm": 2.984375, + "learning_rate": 4.93641851874178e-06, + "loss": 0.631, + "mean_token_accuracy": 0.8710474520921707, + "num_tokens": 55721149.0, + "step": 521 + }, + { + "epoch": 1.1892816419612315, + "grad_norm": 4.5, + "learning_rate": 4.935995756727146e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8732537627220154, + "num_tokens": 55828356.0, + "step": 522 + }, + { + "epoch": 1.1915621436716077, + "grad_norm": 2.53125, + "learning_rate": 4.935571612088349e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.875878319144249, + "num_tokens": 55935753.0, + "step": 523 + }, + { + "epoch": 1.193842645381984, + "grad_norm": 2.96875, + "learning_rate": 4.935146085066125e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.874293178319931, + "num_tokens": 56042443.0, + "step": 524 + }, + { + "epoch": 1.1961231470923603, + "grad_norm": 2.59375, + "learning_rate": 4.934719175901996e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8778766244649887, + "num_tokens": 56149378.0, + "step": 525 + }, + { + "epoch": 1.1984036488027365, + "grad_norm": 3.421875, + "learning_rate": 4.934290884838266e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.8713551312685013, + "num_tokens": 56256623.0, + "step": 526 + }, + { + "epoch": 1.200684150513113, + "grad_norm": 3.703125, + "learning_rate": 4.933861212118027e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.8705524206161499, + "num_tokens": 56363653.0, + "step": 527 + }, + { + "epoch": 1.2029646522234891, + "grad_norm": 3.828125, + "learning_rate": 4.933430157985151e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.8724389970302582, + "num_tokens": 56470381.0, + "step": 528 + }, + { + "epoch": 1.2052451539338653, + "grad_norm": 2.59375, + "learning_rate": 4.932997722684296e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8716868907213211, + "num_tokens": 56577262.0, + "step": 529 + }, + { + "epoch": 1.2075256556442417, + "grad_norm": 4.625, + "learning_rate": 4.932563906460905e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8729743212461472, + "num_tokens": 56683964.0, + "step": 530 + }, + { + "epoch": 1.209806157354618, + "grad_norm": 7.375, + "learning_rate": 4.932128709561202e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.8723554909229279, + "num_tokens": 56791540.0, + "step": 531 + }, + { + "epoch": 1.2120866590649944, + "grad_norm": 6.375, + "learning_rate": 4.931692132232198e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8752623051404953, + "num_tokens": 56898876.0, + "step": 532 + }, + { + "epoch": 1.2143671607753705, + "grad_norm": 3.40625, + "learning_rate": 4.931254174721687e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.867269441485405, + "num_tokens": 57005808.0, + "step": 533 + }, + { + "epoch": 1.216647662485747, + "grad_norm": 2.125, + "learning_rate": 4.930814837278242e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8760450780391693, + "num_tokens": 57113633.0, + "step": 534 + }, + { + "epoch": 1.2189281641961232, + "grad_norm": 5.71875, + "learning_rate": 4.930374120151225e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.8680548220872879, + "num_tokens": 57220723.0, + "step": 535 + }, + { + "epoch": 1.2212086659064993, + "grad_norm": 5.71875, + "learning_rate": 4.929932023590776e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.867982491850853, + "num_tokens": 57327185.0, + "step": 536 + }, + { + "epoch": 1.2234891676168758, + "grad_norm": 7.25, + "learning_rate": 4.929488547847823e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.8724008947610855, + "num_tokens": 57433998.0, + "step": 537 + }, + { + "epoch": 1.225769669327252, + "grad_norm": 5.0, + "learning_rate": 4.9290436931740735e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.8717511296272278, + "num_tokens": 57541338.0, + "step": 538 + }, + { + "epoch": 1.2280501710376284, + "grad_norm": 3.171875, + "learning_rate": 4.928597459822018e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.879209354519844, + "num_tokens": 57648210.0, + "step": 539 + }, + { + "epoch": 1.2303306727480046, + "grad_norm": 3.359375, + "learning_rate": 4.928149848044931e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8770927637815475, + "num_tokens": 57755187.0, + "step": 540 + }, + { + "epoch": 1.2326111744583808, + "grad_norm": 4.375, + "learning_rate": 4.9277008580968665e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8778523355722427, + "num_tokens": 57862318.0, + "step": 541 + }, + { + "epoch": 1.2348916761687572, + "grad_norm": 5.03125, + "learning_rate": 4.927250490232664e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.8774797171354294, + "num_tokens": 57969230.0, + "step": 542 + }, + { + "epoch": 1.2371721778791334, + "grad_norm": 3.109375, + "learning_rate": 4.926798744707943e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8759450018405914, + "num_tokens": 58076744.0, + "step": 543 + }, + { + "epoch": 1.2394526795895098, + "grad_norm": 2.359375, + "learning_rate": 4.926345621779106e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.8749475181102753, + "num_tokens": 58184125.0, + "step": 544 + }, + { + "epoch": 1.241733181299886, + "grad_norm": 2.015625, + "learning_rate": 4.9258911217033355e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.8739427775144577, + "num_tokens": 58290980.0, + "step": 545 + }, + { + "epoch": 1.2440136830102622, + "grad_norm": 3.453125, + "learning_rate": 4.925435244738599e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8763206750154495, + "num_tokens": 58397982.0, + "step": 546 + }, + { + "epoch": 1.2462941847206386, + "grad_norm": 2.71875, + "learning_rate": 4.924977991143642e-06, + "loss": 0.5985, + "mean_token_accuracy": 0.8783524632453918, + "num_tokens": 58505934.0, + "step": 547 + }, + { + "epoch": 1.2485746864310148, + "grad_norm": 2.5, + "learning_rate": 4.924519361177993e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8753952831029892, + "num_tokens": 58613394.0, + "step": 548 + }, + { + "epoch": 1.2508551881413912, + "grad_norm": 1.8984375, + "learning_rate": 4.9240593551019625e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8755134344100952, + "num_tokens": 58721077.0, + "step": 549 + }, + { + "epoch": 1.2531356898517674, + "grad_norm": 2.484375, + "learning_rate": 4.92359797317664e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.8757786601781845, + "num_tokens": 58828151.0, + "step": 550 + }, + { + "epoch": 1.2554161915621438, + "grad_norm": 2.375, + "learning_rate": 4.923135215663897e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8740972727537155, + "num_tokens": 58935083.0, + "step": 551 + }, + { + "epoch": 1.25769669327252, + "grad_norm": 4.09375, + "learning_rate": 4.922671082826386e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8730472475290298, + "num_tokens": 59041838.0, + "step": 552 + }, + { + "epoch": 1.2599771949828962, + "grad_norm": 2.3125, + "learning_rate": 4.92220557492754e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.8769043833017349, + "num_tokens": 59149008.0, + "step": 553 + }, + { + "epoch": 1.2622576966932726, + "grad_norm": 2.328125, + "learning_rate": 4.921738692231572e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8772878795862198, + "num_tokens": 59255903.0, + "step": 554 + }, + { + "epoch": 1.2645381984036488, + "grad_norm": 2.203125, + "learning_rate": 4.9212704350034764e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.8748277425765991, + "num_tokens": 59362809.0, + "step": 555 + }, + { + "epoch": 1.2668187001140252, + "grad_norm": 2.921875, + "learning_rate": 4.920800803509026e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.87485072016716, + "num_tokens": 59469454.0, + "step": 556 + }, + { + "epoch": 1.2690992018244014, + "grad_norm": 4.1875, + "learning_rate": 4.920329798014775e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8719239979982376, + "num_tokens": 59576709.0, + "step": 557 + }, + { + "epoch": 1.2713797035347776, + "grad_norm": 2.265625, + "learning_rate": 4.919857418788056e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.8737199753522873, + "num_tokens": 59683387.0, + "step": 558 + }, + { + "epoch": 1.273660205245154, + "grad_norm": 2.53125, + "learning_rate": 4.919383666096985e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.8742072433233261, + "num_tokens": 59790354.0, + "step": 559 + }, + { + "epoch": 1.2759407069555302, + "grad_norm": 4.8125, + "learning_rate": 4.918908540210452e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.869797870516777, + "num_tokens": 59897756.0, + "step": 560 + }, + { + "epoch": 1.2782212086659066, + "grad_norm": 2.859375, + "learning_rate": 4.91843204139813e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.88178950548172, + "num_tokens": 60004961.0, + "step": 561 + }, + { + "epoch": 1.2805017103762828, + "grad_norm": 2.40625, + "learning_rate": 4.917954169930472e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8791621029376984, + "num_tokens": 60112141.0, + "step": 562 + }, + { + "epoch": 1.282782212086659, + "grad_norm": 2.46875, + "learning_rate": 4.917474926078707e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8750548511743546, + "num_tokens": 60220073.0, + "step": 563 + }, + { + "epoch": 1.2850627137970354, + "grad_norm": 2.75, + "learning_rate": 4.916994310114845e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.8793503940105438, + "num_tokens": 60327794.0, + "step": 564 + }, + { + "epoch": 1.2873432155074116, + "grad_norm": 3.0625, + "learning_rate": 4.916512322311675e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.86960369348526, + "num_tokens": 60434871.0, + "step": 565 + }, + { + "epoch": 1.289623717217788, + "grad_norm": 3.703125, + "learning_rate": 4.916028962942763e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8765391111373901, + "num_tokens": 60542243.0, + "step": 566 + }, + { + "epoch": 1.2919042189281642, + "grad_norm": 2.890625, + "learning_rate": 4.915544232282455e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8739331364631653, + "num_tokens": 60649576.0, + "step": 567 + }, + { + "epoch": 1.2941847206385404, + "grad_norm": 1.8828125, + "learning_rate": 4.915058130605874e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8797845989465714, + "num_tokens": 60756480.0, + "step": 568 + }, + { + "epoch": 1.2964652223489168, + "grad_norm": 1.9609375, + "learning_rate": 4.9145706581889235e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8693940490484238, + "num_tokens": 60863068.0, + "step": 569 + }, + { + "epoch": 1.298745724059293, + "grad_norm": 5.1875, + "learning_rate": 4.914081815308283e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8792444169521332, + "num_tokens": 60969966.0, + "step": 570 + }, + { + "epoch": 1.3010262257696694, + "grad_norm": 2.75, + "learning_rate": 4.913591602241409e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.8721539825201035, + "num_tokens": 61076972.0, + "step": 571 + }, + { + "epoch": 1.3033067274800456, + "grad_norm": 3.53125, + "learning_rate": 4.9131000192665365e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8705037534236908, + "num_tokens": 61183878.0, + "step": 572 + }, + { + "epoch": 1.3055872291904218, + "grad_norm": 2.671875, + "learning_rate": 4.9126070666626815e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8774714469909668, + "num_tokens": 61290915.0, + "step": 573 + }, + { + "epoch": 1.3078677309007982, + "grad_norm": 6.53125, + "learning_rate": 4.912112744709632e-06, + "loss": 0.599, + "mean_token_accuracy": 0.876824364066124, + "num_tokens": 61397686.0, + "step": 574 + }, + { + "epoch": 1.3101482326111744, + "grad_norm": 4.25, + "learning_rate": 4.911617053687957e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8789774626493454, + "num_tokens": 61504802.0, + "step": 575 + }, + { + "epoch": 1.3124287343215508, + "grad_norm": 2.453125, + "learning_rate": 4.911119993878999e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8741600215435028, + "num_tokens": 61611778.0, + "step": 576 + }, + { + "epoch": 1.314709236031927, + "grad_norm": 2.65625, + "learning_rate": 4.910621565564882e-06, + "loss": 0.6029, + "mean_token_accuracy": 0.8803430199623108, + "num_tokens": 61719136.0, + "step": 577 + }, + { + "epoch": 1.3169897377423032, + "grad_norm": 4.375, + "learning_rate": 4.910121769028503e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8754008263349533, + "num_tokens": 61826166.0, + "step": 578 + }, + { + "epoch": 1.3192702394526796, + "grad_norm": 8.625, + "learning_rate": 4.909620604553537e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.8695260733366013, + "num_tokens": 61933001.0, + "step": 579 + }, + { + "epoch": 1.3215507411630558, + "grad_norm": 7.34375, + "learning_rate": 4.909118072424436e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.880437433719635, + "num_tokens": 62039905.0, + "step": 580 + }, + { + "epoch": 1.3238312428734322, + "grad_norm": 6.21875, + "learning_rate": 4.908614172926426e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8764496594667435, + "num_tokens": 62147016.0, + "step": 581 + }, + { + "epoch": 1.3261117445838084, + "grad_norm": 3.203125, + "learning_rate": 4.908108906345512e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8747891783714294, + "num_tokens": 62254504.0, + "step": 582 + }, + { + "epoch": 1.3283922462941846, + "grad_norm": 3.75, + "learning_rate": 4.907602272968473e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8759507387876511, + "num_tokens": 62361146.0, + "step": 583 + }, + { + "epoch": 1.330672748004561, + "grad_norm": 2.734375, + "learning_rate": 4.907094273082865e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.8714919984340668, + "num_tokens": 62467825.0, + "step": 584 + }, + { + "epoch": 1.3329532497149372, + "grad_norm": 2.8125, + "learning_rate": 4.906584906977018e-06, + "loss": 0.606, + "mean_token_accuracy": 0.8751325905323029, + "num_tokens": 62575437.0, + "step": 585 + }, + { + "epoch": 1.3352337514253136, + "grad_norm": 2.171875, + "learning_rate": 4.906074174940038e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8743661940097809, + "num_tokens": 62682439.0, + "step": 586 + }, + { + "epoch": 1.3375142531356898, + "grad_norm": 2.125, + "learning_rate": 4.905562077261808e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.8771595358848572, + "num_tokens": 62789139.0, + "step": 587 + }, + { + "epoch": 1.339794754846066, + "grad_norm": 2.1875, + "learning_rate": 4.905048614232984e-06, + "loss": 0.62, + "mean_token_accuracy": 0.8709544539451599, + "num_tokens": 62896172.0, + "step": 588 + }, + { + "epoch": 1.3420752565564424, + "grad_norm": 1.9765625, + "learning_rate": 4.904533786144998e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.8748406618833542, + "num_tokens": 63003536.0, + "step": 589 + }, + { + "epoch": 1.3443557582668186, + "grad_norm": 2.296875, + "learning_rate": 4.904017593290056e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8730844259262085, + "num_tokens": 63110204.0, + "step": 590 + }, + { + "epoch": 1.346636259977195, + "grad_norm": 3.03125, + "learning_rate": 4.903500035961139e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8759681731462479, + "num_tokens": 63217782.0, + "step": 591 + }, + { + "epoch": 1.3489167616875712, + "grad_norm": 2.046875, + "learning_rate": 4.902981114452005e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.8746581524610519, + "num_tokens": 63324751.0, + "step": 592 + }, + { + "epoch": 1.3511972633979474, + "grad_norm": 2.140625, + "learning_rate": 4.90246082905718e-06, + "loss": 0.617, + "mean_token_accuracy": 0.8720570355653763, + "num_tokens": 63431709.0, + "step": 593 + }, + { + "epoch": 1.3534777651083238, + "grad_norm": 3.09375, + "learning_rate": 4.90193918007197e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.8742910176515579, + "num_tokens": 63538728.0, + "step": 594 + }, + { + "epoch": 1.3557582668187, + "grad_norm": 2.921875, + "learning_rate": 4.901416167792452e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.875694528222084, + "num_tokens": 63645888.0, + "step": 595 + }, + { + "epoch": 1.3580387685290765, + "grad_norm": 2.71875, + "learning_rate": 4.9008917925154795e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8773985803127289, + "num_tokens": 63753292.0, + "step": 596 + }, + { + "epoch": 1.3603192702394526, + "grad_norm": 2.046875, + "learning_rate": 4.900366054538675e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8744669109582901, + "num_tokens": 63859832.0, + "step": 597 + }, + { + "epoch": 1.3625997719498288, + "grad_norm": 9.0625, + "learning_rate": 4.8998389541604405e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.8743558824062347, + "num_tokens": 63966869.0, + "step": 598 + }, + { + "epoch": 1.3648802736602053, + "grad_norm": 7.9375, + "learning_rate": 4.899310491679945e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8777134567499161, + "num_tokens": 64074013.0, + "step": 599 + }, + { + "epoch": 1.3671607753705814, + "grad_norm": 3.015625, + "learning_rate": 4.898780667397136e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8756420910358429, + "num_tokens": 64181228.0, + "step": 600 + }, + { + "epoch": 1.3694412770809579, + "grad_norm": 4.9375, + "learning_rate": 4.89824948161273e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8778956681489944, + "num_tokens": 64288189.0, + "step": 601 + }, + { + "epoch": 1.371721778791334, + "grad_norm": 2.140625, + "learning_rate": 4.8977169346282184e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.8754762411117554, + "num_tokens": 64395116.0, + "step": 602 + }, + { + "epoch": 1.3740022805017102, + "grad_norm": 2.59375, + "learning_rate": 4.8971830267458645e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8785329908132553, + "num_tokens": 64502388.0, + "step": 603 + }, + { + "epoch": 1.3762827822120867, + "grad_norm": 2.90625, + "learning_rate": 4.896647758268703e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8748378753662109, + "num_tokens": 64609541.0, + "step": 604 + }, + { + "epoch": 1.378563283922463, + "grad_norm": 3.8125, + "learning_rate": 4.8961111295005444e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.8767747431993484, + "num_tokens": 64716199.0, + "step": 605 + }, + { + "epoch": 1.3808437856328393, + "grad_norm": 2.953125, + "learning_rate": 4.895573140745967e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8689918667078018, + "num_tokens": 64823123.0, + "step": 606 + }, + { + "epoch": 1.3831242873432155, + "grad_norm": 1.9296875, + "learning_rate": 4.895033792310323e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.879554808139801, + "num_tokens": 64930304.0, + "step": 607 + }, + { + "epoch": 1.3854047890535917, + "grad_norm": 3.859375, + "learning_rate": 4.894493084499736e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.8728460669517517, + "num_tokens": 65037508.0, + "step": 608 + }, + { + "epoch": 1.387685290763968, + "grad_norm": 2.203125, + "learning_rate": 4.893951017621103e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8793998062610626, + "num_tokens": 65145188.0, + "step": 609 + }, + { + "epoch": 1.3899657924743445, + "grad_norm": 6.59375, + "learning_rate": 4.893407591982088e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.8728020936250687, + "num_tokens": 65251638.0, + "step": 610 + }, + { + "epoch": 1.3922462941847207, + "grad_norm": 3.46875, + "learning_rate": 4.892862807891131e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8728552013635635, + "num_tokens": 65358821.0, + "step": 611 + }, + { + "epoch": 1.3945267958950969, + "grad_norm": 2.203125, + "learning_rate": 4.89231666565744e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.8699388653039932, + "num_tokens": 65466008.0, + "step": 612 + }, + { + "epoch": 1.3968072976054733, + "grad_norm": 2.40625, + "learning_rate": 4.891769165590995e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8760533332824707, + "num_tokens": 65573422.0, + "step": 613 + }, + { + "epoch": 1.3990877993158495, + "grad_norm": 2.578125, + "learning_rate": 4.891220308002547e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.8774368613958359, + "num_tokens": 65679987.0, + "step": 614 + }, + { + "epoch": 1.401368301026226, + "grad_norm": 2.84375, + "learning_rate": 4.890670093203617e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.8716036677360535, + "num_tokens": 65786496.0, + "step": 615 + }, + { + "epoch": 1.403648802736602, + "grad_norm": 5.25, + "learning_rate": 4.890118521506494e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.8668168038129807, + "num_tokens": 65893146.0, + "step": 616 + }, + { + "epoch": 1.4059293044469783, + "grad_norm": 3.25, + "learning_rate": 4.889565593224242e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8759677559137344, + "num_tokens": 65999804.0, + "step": 617 + }, + { + "epoch": 1.4082098061573547, + "grad_norm": 7.4375, + "learning_rate": 4.889011308670693e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8765671104192734, + "num_tokens": 66106566.0, + "step": 618 + }, + { + "epoch": 1.4104903078677309, + "grad_norm": 3.328125, + "learning_rate": 4.8884556681604445e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8747722804546356, + "num_tokens": 66213934.0, + "step": 619 + }, + { + "epoch": 1.4127708095781073, + "grad_norm": 4.34375, + "learning_rate": 4.8878986720088715e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.878739207983017, + "num_tokens": 66320771.0, + "step": 620 + }, + { + "epoch": 1.4150513112884835, + "grad_norm": 2.90625, + "learning_rate": 4.8873403205321115e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8718791306018829, + "num_tokens": 66428168.0, + "step": 621 + }, + { + "epoch": 1.4173318129988597, + "grad_norm": 2.234375, + "learning_rate": 4.886780614047075e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.877121314406395, + "num_tokens": 66535417.0, + "step": 622 + }, + { + "epoch": 1.419612314709236, + "grad_norm": 4.90625, + "learning_rate": 4.886219552871441e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.8733110427856445, + "num_tokens": 66642970.0, + "step": 623 + }, + { + "epoch": 1.4218928164196123, + "grad_norm": 6.0, + "learning_rate": 4.885657137323656e-06, + "loss": 0.6033, + "mean_token_accuracy": 0.8745451271533966, + "num_tokens": 66750313.0, + "step": 624 + }, + { + "epoch": 1.4241733181299887, + "grad_norm": 3.796875, + "learning_rate": 4.885093367722937e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8829841762781143, + "num_tokens": 66857610.0, + "step": 625 + }, + { + "epoch": 1.426453819840365, + "grad_norm": 6.15625, + "learning_rate": 4.884528244389269e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8738205432891846, + "num_tokens": 66964325.0, + "step": 626 + }, + { + "epoch": 1.428734321550741, + "grad_norm": 1.9921875, + "learning_rate": 4.883961767643404e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8796354234218597, + "num_tokens": 67071449.0, + "step": 627 + }, + { + "epoch": 1.4310148232611175, + "grad_norm": 4.125, + "learning_rate": 4.883393937806864e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8746996223926544, + "num_tokens": 67178773.0, + "step": 628 + }, + { + "epoch": 1.4332953249714937, + "grad_norm": 6.21875, + "learning_rate": 4.882824755201938e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8757723271846771, + "num_tokens": 67286302.0, + "step": 629 + }, + { + "epoch": 1.4355758266818701, + "grad_norm": 8.4375, + "learning_rate": 4.8822542201516835e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.8711700737476349, + "num_tokens": 67393466.0, + "step": 630 + }, + { + "epoch": 1.4378563283922463, + "grad_norm": 3.46875, + "learning_rate": 4.881682332979925e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.869665339589119, + "num_tokens": 67501293.0, + "step": 631 + }, + { + "epoch": 1.4401368301026225, + "grad_norm": 2.6875, + "learning_rate": 4.881109094011254e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8787660300731659, + "num_tokens": 67609069.0, + "step": 632 + }, + { + "epoch": 1.442417331812999, + "grad_norm": 5.5, + "learning_rate": 4.88053450357103e-06, + "loss": 0.6089, + "mean_token_accuracy": 0.8752091825008392, + "num_tokens": 67716379.0, + "step": 633 + }, + { + "epoch": 1.444697833523375, + "grad_norm": 4.6875, + "learning_rate": 4.87995856198538e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.8721603453159332, + "num_tokens": 67823022.0, + "step": 634 + }, + { + "epoch": 1.4469783352337515, + "grad_norm": 4.0625, + "learning_rate": 4.879381269581197e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.8731086701154709, + "num_tokens": 67929811.0, + "step": 635 + }, + { + "epoch": 1.4492588369441277, + "grad_norm": 2.015625, + "learning_rate": 4.878802626686141e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8828457891941071, + "num_tokens": 68036844.0, + "step": 636 + }, + { + "epoch": 1.451539338654504, + "grad_norm": 2.9375, + "learning_rate": 4.8782226336286395e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.8716375380754471, + "num_tokens": 68144018.0, + "step": 637 + }, + { + "epoch": 1.4538198403648803, + "grad_norm": 2.46875, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8821705430746078, + "num_tokens": 68251176.0, + "step": 638 + }, + { + "epoch": 1.4561003420752565, + "grad_norm": 2.375, + "learning_rate": 4.877058598343835e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8753101974725723, + "num_tokens": 68358878.0, + "step": 639 + }, + { + "epoch": 1.458380843785633, + "grad_norm": 1.8515625, + "learning_rate": 4.876474556777216e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8794443160295486, + "num_tokens": 68466299.0, + "step": 640 + }, + { + "epoch": 1.4606613454960091, + "grad_norm": 2.46875, + "learning_rate": 4.8758891663695165e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8779085278511047, + "num_tokens": 68573383.0, + "step": 641 + }, + { + "epoch": 1.4629418472063853, + "grad_norm": 2.3125, + "learning_rate": 4.875302427452996e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8709381222724915, + "num_tokens": 68680054.0, + "step": 642 + }, + { + "epoch": 1.4652223489167617, + "grad_norm": 2.953125, + "learning_rate": 4.874714340360674e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8749048262834549, + "num_tokens": 68787643.0, + "step": 643 + }, + { + "epoch": 1.467502850627138, + "grad_norm": 2.453125, + "learning_rate": 4.874124905426339e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8763098567724228, + "num_tokens": 68894888.0, + "step": 644 + }, + { + "epoch": 1.4697833523375143, + "grad_norm": 3.0, + "learning_rate": 4.873534122984541e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8762774467468262, + "num_tokens": 69001443.0, + "step": 645 + }, + { + "epoch": 1.4720638540478905, + "grad_norm": 2.125, + "learning_rate": 4.872941993370598e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.882948562502861, + "num_tokens": 69108474.0, + "step": 646 + }, + { + "epoch": 1.4743443557582667, + "grad_norm": 3.265625, + "learning_rate": 4.872348516920591e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8692348450422287, + "num_tokens": 69215286.0, + "step": 647 + }, + { + "epoch": 1.4766248574686431, + "grad_norm": 1.890625, + "learning_rate": 4.8717536939713665e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8783621490001678, + "num_tokens": 69322791.0, + "step": 648 + }, + { + "epoch": 1.4789053591790193, + "grad_norm": 4.75, + "learning_rate": 4.871157524860533e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8760869204998016, + "num_tokens": 69429973.0, + "step": 649 + }, + { + "epoch": 1.4811858608893957, + "grad_norm": 3.796875, + "learning_rate": 4.870560009926465e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8765671104192734, + "num_tokens": 69537546.0, + "step": 650 + }, + { + "epoch": 1.483466362599772, + "grad_norm": 2.953125, + "learning_rate": 4.869961149508301e-06, + "loss": 0.6213, + "mean_token_accuracy": 0.8740588128566742, + "num_tokens": 69644078.0, + "step": 651 + }, + { + "epoch": 1.4857468643101481, + "grad_norm": 4.3125, + "learning_rate": 4.869360943945943e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8758237510919571, + "num_tokens": 69750942.0, + "step": 652 + }, + { + "epoch": 1.4880273660205245, + "grad_norm": 3.546875, + "learning_rate": 4.868759393580054e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.8765598982572556, + "num_tokens": 69857823.0, + "step": 653 + }, + { + "epoch": 1.4903078677309007, + "grad_norm": 2.1875, + "learning_rate": 4.868156498752066e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8746924102306366, + "num_tokens": 69964959.0, + "step": 654 + }, + { + "epoch": 1.4925883694412772, + "grad_norm": 2.34375, + "learning_rate": 4.8675522598041675e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.8732979595661163, + "num_tokens": 70071686.0, + "step": 655 + }, + { + "epoch": 1.4948688711516533, + "grad_norm": 2.109375, + "learning_rate": 4.866946677079314e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.875095933675766, + "num_tokens": 70179037.0, + "step": 656 + }, + { + "epoch": 1.4971493728620295, + "grad_norm": 4.1875, + "learning_rate": 4.866339750921222e-06, + "loss": 0.6294, + "mean_token_accuracy": 0.8688063323497772, + "num_tokens": 70285939.0, + "step": 657 + }, + { + "epoch": 1.499429874572406, + "grad_norm": 3.1875, + "learning_rate": 4.86573148167437e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.8754535466432571, + "num_tokens": 70393254.0, + "step": 658 + }, + { + "epoch": 1.5017103762827824, + "grad_norm": 5.34375, + "learning_rate": 4.865121869684003e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8764495700597763, + "num_tokens": 70500856.0, + "step": 659 + }, + { + "epoch": 1.5039908779931586, + "grad_norm": 3.390625, + "learning_rate": 4.864510915296122e-06, + "loss": 0.597, + "mean_token_accuracy": 0.8765180259943008, + "num_tokens": 70608662.0, + "step": 660 + }, + { + "epoch": 1.5039908779931586, + "eval_loss": 0.612367570400238, + "eval_mean_token_accuracy": 0.8747105800153638, + "eval_num_tokens": 70608662.0, + "eval_runtime": 58.5955, + "eval_samples_per_second": 143.1, + "eval_steps_per_second": 4.488, + "step": 660 + }, + { + "epoch": 1.5062713797035348, + "grad_norm": 2.109375, + "learning_rate": 4.8638986188574955e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.87701815366745, + "num_tokens": 70715623.0, + "step": 661 + }, + { + "epoch": 1.508551881413911, + "grad_norm": 3.53125, + "learning_rate": 4.863284980715649e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8763703256845474, + "num_tokens": 70822494.0, + "step": 662 + }, + { + "epoch": 1.5108323831242874, + "grad_norm": 2.265625, + "learning_rate": 4.8626700012188724e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.8716311007738113, + "num_tokens": 70929521.0, + "step": 663 + }, + { + "epoch": 1.5131128848346638, + "grad_norm": 3.171875, + "learning_rate": 4.8620536807162164e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8747840225696564, + "num_tokens": 71036264.0, + "step": 664 + }, + { + "epoch": 1.51539338654504, + "grad_norm": 2.328125, + "learning_rate": 4.861436019557492e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8713407516479492, + "num_tokens": 71142930.0, + "step": 665 + }, + { + "epoch": 1.5176738882554162, + "grad_norm": 2.6875, + "learning_rate": 4.8608170180932725e-06, + "loss": 0.6033, + "mean_token_accuracy": 0.8776234537363052, + "num_tokens": 71249846.0, + "step": 666 + }, + { + "epoch": 1.5199543899657924, + "grad_norm": 2.203125, + "learning_rate": 4.860196676674891e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8785290718078613, + "num_tokens": 71356611.0, + "step": 667 + }, + { + "epoch": 1.5222348916761688, + "grad_norm": 2.5625, + "learning_rate": 4.8595749956544414e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8775981813669205, + "num_tokens": 71463513.0, + "step": 668 + }, + { + "epoch": 1.5245153933865452, + "grad_norm": 3.171875, + "learning_rate": 4.858951975384777e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.874157264828682, + "num_tokens": 71570255.0, + "step": 669 + }, + { + "epoch": 1.5267958950969214, + "grad_norm": 5.875, + "learning_rate": 4.858327616219513e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8791629523038864, + "num_tokens": 71677310.0, + "step": 670 + }, + { + "epoch": 1.5290763968072976, + "grad_norm": 5.15625, + "learning_rate": 4.857701918513023e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.8735476732254028, + "num_tokens": 71783915.0, + "step": 671 + }, + { + "epoch": 1.5313568985176738, + "grad_norm": 3.6875, + "learning_rate": 4.857074882620442e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.8756736516952515, + "num_tokens": 71891495.0, + "step": 672 + }, + { + "epoch": 1.5336374002280502, + "grad_norm": 2.453125, + "learning_rate": 4.856446508897662e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.8775335848331451, + "num_tokens": 71999208.0, + "step": 673 + }, + { + "epoch": 1.5359179019384266, + "grad_norm": 8.1875, + "learning_rate": 4.8558167977013365e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.8727478533983231, + "num_tokens": 72106904.0, + "step": 674 + }, + { + "epoch": 1.5381984036488028, + "grad_norm": 6.59375, + "learning_rate": 4.8551857493888775e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8790593892335892, + "num_tokens": 72213551.0, + "step": 675 + }, + { + "epoch": 1.540478905359179, + "grad_norm": 6.1875, + "learning_rate": 4.854553364318456e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8734369874000549, + "num_tokens": 72320910.0, + "step": 676 + }, + { + "epoch": 1.5427594070695552, + "grad_norm": 2.53125, + "learning_rate": 4.8539196428490016e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.8696837574243546, + "num_tokens": 72427539.0, + "step": 677 + }, + { + "epoch": 1.5450399087799316, + "grad_norm": 4.84375, + "learning_rate": 4.8532845853402015e-06, + "loss": 0.622, + "mean_token_accuracy": 0.8762244433164597, + "num_tokens": 72534311.0, + "step": 678 + }, + { + "epoch": 1.547320410490308, + "grad_norm": 3.765625, + "learning_rate": 4.8526481921525035e-06, + "loss": 0.615, + "mean_token_accuracy": 0.872207522392273, + "num_tokens": 72641361.0, + "step": 679 + }, + { + "epoch": 1.5496009122006842, + "grad_norm": 3.625, + "learning_rate": 4.85201046364711e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.8779794573783875, + "num_tokens": 72748703.0, + "step": 680 + }, + { + "epoch": 1.5518814139110604, + "grad_norm": 2.65625, + "learning_rate": 4.851371400185986e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8768866658210754, + "num_tokens": 72855762.0, + "step": 681 + }, + { + "epoch": 1.5541619156214366, + "grad_norm": 2.40625, + "learning_rate": 4.85073100213185e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8712277412414551, + "num_tokens": 72962645.0, + "step": 682 + }, + { + "epoch": 1.556442417331813, + "grad_norm": 4.34375, + "learning_rate": 4.8500892698481784e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8731422871351242, + "num_tokens": 73069760.0, + "step": 683 + }, + { + "epoch": 1.5587229190421894, + "grad_norm": 3.609375, + "learning_rate": 4.849446203699209e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8730466067790985, + "num_tokens": 73176865.0, + "step": 684 + }, + { + "epoch": 1.5610034207525656, + "grad_norm": 2.375, + "learning_rate": 4.848801804049932e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8776687532663345, + "num_tokens": 73284011.0, + "step": 685 + }, + { + "epoch": 1.5632839224629418, + "grad_norm": 3.9375, + "learning_rate": 4.848156071266095e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.8770481199026108, + "num_tokens": 73390880.0, + "step": 686 + }, + { + "epoch": 1.565564424173318, + "grad_norm": 2.78125, + "learning_rate": 4.847509005714207e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.8751052618026733, + "num_tokens": 73497575.0, + "step": 687 + }, + { + "epoch": 1.5678449258836944, + "grad_norm": 4.71875, + "learning_rate": 4.846860607761527e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8736310601234436, + "num_tokens": 73604375.0, + "step": 688 + }, + { + "epoch": 1.5701254275940708, + "grad_norm": 2.171875, + "learning_rate": 4.8462108777760734e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.873234361410141, + "num_tokens": 73712100.0, + "step": 689 + }, + { + "epoch": 1.572405929304447, + "grad_norm": 2.453125, + "learning_rate": 4.845559816126622e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8704589009284973, + "num_tokens": 73819126.0, + "step": 690 + }, + { + "epoch": 1.5746864310148232, + "grad_norm": 4.25, + "learning_rate": 4.844907423182699e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8772129714488983, + "num_tokens": 73925905.0, + "step": 691 + }, + { + "epoch": 1.5769669327251994, + "grad_norm": 3.90625, + "learning_rate": 4.844253699314596e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.875739112496376, + "num_tokens": 74033596.0, + "step": 692 + }, + { + "epoch": 1.5792474344355758, + "grad_norm": 2.453125, + "learning_rate": 4.843598644893349e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8757044672966003, + "num_tokens": 74140480.0, + "step": 693 + }, + { + "epoch": 1.5815279361459522, + "grad_norm": 3.828125, + "learning_rate": 4.842942260290757e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.8707884252071381, + "num_tokens": 74247691.0, + "step": 694 + }, + { + "epoch": 1.5838084378563284, + "grad_norm": 2.703125, + "learning_rate": 4.84228454587937e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8712183386087418, + "num_tokens": 74354568.0, + "step": 695 + }, + { + "epoch": 1.5860889395667046, + "grad_norm": 3.328125, + "learning_rate": 4.841625502032495e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8787146657705307, + "num_tokens": 74461346.0, + "step": 696 + }, + { + "epoch": 1.5883694412770808, + "grad_norm": 3.15625, + "learning_rate": 4.84096512912419e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.8755437433719635, + "num_tokens": 74567912.0, + "step": 697 + }, + { + "epoch": 1.5906499429874572, + "grad_norm": 2.21875, + "learning_rate": 4.8403034275292735e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8786727637052536, + "num_tokens": 74674714.0, + "step": 698 + }, + { + "epoch": 1.5929304446978336, + "grad_norm": 2.984375, + "learning_rate": 4.839640397623312e-06, + "loss": 0.604, + "mean_token_accuracy": 0.8754930794239044, + "num_tokens": 74781656.0, + "step": 699 + }, + { + "epoch": 1.5952109464082098, + "grad_norm": 4.125, + "learning_rate": 4.83897603978263e-06, + "loss": 0.6078, + "mean_token_accuracy": 0.8756787329912186, + "num_tokens": 74888714.0, + "step": 700 + }, + { + "epoch": 1.597491448118586, + "grad_norm": 4.03125, + "learning_rate": 4.838310354384304e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8733028322458267, + "num_tokens": 74995535.0, + "step": 701 + }, + { + "epoch": 1.5997719498289624, + "grad_norm": 2.171875, + "learning_rate": 4.8376433418061615e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.8739898204803467, + "num_tokens": 75102558.0, + "step": 702 + }, + { + "epoch": 1.6020524515393386, + "grad_norm": 4.6875, + "learning_rate": 4.8369750024267904e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8737581223249435, + "num_tokens": 75209920.0, + "step": 703 + }, + { + "epoch": 1.604332953249715, + "grad_norm": 4.03125, + "learning_rate": 4.836305336625523e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.8702377676963806, + "num_tokens": 75316855.0, + "step": 704 + }, + { + "epoch": 1.6066134549600912, + "grad_norm": 4.84375, + "learning_rate": 4.835634344782453e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8709095418453217, + "num_tokens": 75423929.0, + "step": 705 + }, + { + "epoch": 1.6088939566704674, + "grad_norm": 2.53125, + "learning_rate": 4.834962027278418e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.873084306716919, + "num_tokens": 75531031.0, + "step": 706 + }, + { + "epoch": 1.6111744583808438, + "grad_norm": 2.28125, + "learning_rate": 4.834288384495015e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.8735803365707397, + "num_tokens": 75637818.0, + "step": 707 + }, + { + "epoch": 1.61345496009122, + "grad_norm": 2.953125, + "learning_rate": 4.833613416814591e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8781051784753799, + "num_tokens": 75744879.0, + "step": 708 + }, + { + "epoch": 1.6157354618015964, + "grad_norm": 2.421875, + "learning_rate": 4.832937124620243e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8801741451025009, + "num_tokens": 75852406.0, + "step": 709 + }, + { + "epoch": 1.6180159635119726, + "grad_norm": 5.84375, + "learning_rate": 4.832259508295822e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8746786862611771, + "num_tokens": 75959416.0, + "step": 710 + }, + { + "epoch": 1.6202964652223488, + "grad_norm": 4.125, + "learning_rate": 4.831580568225931e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8775349110364914, + "num_tokens": 76067055.0, + "step": 711 + }, + { + "epoch": 1.6225769669327252, + "grad_norm": 2.6875, + "learning_rate": 4.830900304795921e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8796486407518387, + "num_tokens": 76174493.0, + "step": 712 + }, + { + "epoch": 1.6248574686431014, + "grad_norm": 2.546875, + "learning_rate": 4.8302187183918996e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.8721607774496078, + "num_tokens": 76281621.0, + "step": 713 + }, + { + "epoch": 1.6271379703534778, + "grad_norm": 2.546875, + "learning_rate": 4.8295358094007184e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.8758230358362198, + "num_tokens": 76387782.0, + "step": 714 + }, + { + "epoch": 1.629418472063854, + "grad_norm": 3.328125, + "learning_rate": 4.828851578209986e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8804755955934525, + "num_tokens": 76495248.0, + "step": 715 + }, + { + "epoch": 1.6316989737742302, + "grad_norm": 2.5625, + "learning_rate": 4.828166025208059e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8788855522871017, + "num_tokens": 76602322.0, + "step": 716 + }, + { + "epoch": 1.6339794754846066, + "grad_norm": 3.5, + "learning_rate": 4.8274791507840416e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.876452699303627, + "num_tokens": 76709429.0, + "step": 717 + }, + { + "epoch": 1.636259977194983, + "grad_norm": 2.109375, + "learning_rate": 4.826790955327793e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8712098151445389, + "num_tokens": 76817050.0, + "step": 718 + }, + { + "epoch": 1.6385404789053593, + "grad_norm": 2.3125, + "learning_rate": 4.826101439229918e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8716437220573425, + "num_tokens": 76923670.0, + "step": 719 + }, + { + "epoch": 1.6408209806157354, + "grad_norm": 5.0, + "learning_rate": 4.825410602881774e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.8725506961345673, + "num_tokens": 77030550.0, + "step": 720 + }, + { + "epoch": 1.6431014823261116, + "grad_norm": 4.25, + "learning_rate": 4.824718446675465e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8758846968412399, + "num_tokens": 77137310.0, + "step": 721 + }, + { + "epoch": 1.645381984036488, + "grad_norm": 2.84375, + "learning_rate": 4.8240249710038455e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8754921108484268, + "num_tokens": 77244391.0, + "step": 722 + }, + { + "epoch": 1.6476624857468645, + "grad_norm": 5.40625, + "learning_rate": 4.82333017626052e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8759109824895859, + "num_tokens": 77350980.0, + "step": 723 + }, + { + "epoch": 1.6499429874572407, + "grad_norm": 7.5, + "learning_rate": 4.82263406283984e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.8713529407978058, + "num_tokens": 77457863.0, + "step": 724 + }, + { + "epoch": 1.6522234891676169, + "grad_norm": 5.15625, + "learning_rate": 4.821936631136907e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8779774755239487, + "num_tokens": 77564956.0, + "step": 725 + }, + { + "epoch": 1.654503990877993, + "grad_norm": 2.515625, + "learning_rate": 4.821237881547567e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8784075975418091, + "num_tokens": 77672980.0, + "step": 726 + }, + { + "epoch": 1.6567844925883695, + "grad_norm": 2.234375, + "learning_rate": 4.82053781446842e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8801317662000656, + "num_tokens": 77779982.0, + "step": 727 + }, + { + "epoch": 1.6590649942987459, + "grad_norm": 2.140625, + "learning_rate": 4.819836430296809e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8747014552354813, + "num_tokens": 77886824.0, + "step": 728 + }, + { + "epoch": 1.661345496009122, + "grad_norm": 3.421875, + "learning_rate": 4.819133729430826e-06, + "loss": 0.606, + "mean_token_accuracy": 0.874286100268364, + "num_tokens": 77993947.0, + "step": 729 + }, + { + "epoch": 1.6636259977194983, + "grad_norm": 3.46875, + "learning_rate": 4.818429712269312e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8773009479045868, + "num_tokens": 78101464.0, + "step": 730 + }, + { + "epoch": 1.6659064994298745, + "grad_norm": 4.34375, + "learning_rate": 4.8177243792118515e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8781311810016632, + "num_tokens": 78208412.0, + "step": 731 + }, + { + "epoch": 1.6681870011402509, + "grad_norm": 5.03125, + "learning_rate": 4.8170177306587785e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8763359040021896, + "num_tokens": 78316677.0, + "step": 732 + }, + { + "epoch": 1.6704675028506273, + "grad_norm": 3.421875, + "learning_rate": 4.8163097670111735e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8768177330493927, + "num_tokens": 78424519.0, + "step": 733 + }, + { + "epoch": 1.6727480045610035, + "grad_norm": 5.1875, + "learning_rate": 4.815600488670863e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8756203055381775, + "num_tokens": 78531596.0, + "step": 734 + }, + { + "epoch": 1.6750285062713797, + "grad_norm": 2.109375, + "learning_rate": 4.81488989604042e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8781074434518814, + "num_tokens": 78638185.0, + "step": 735 + }, + { + "epoch": 1.6773090079817559, + "grad_norm": 3.3125, + "learning_rate": 4.814177989523162e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8762544244527817, + "num_tokens": 78745981.0, + "step": 736 + }, + { + "epoch": 1.6795895096921323, + "grad_norm": 4.15625, + "learning_rate": 4.813464769523154e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.8740936666727066, + "num_tokens": 78853266.0, + "step": 737 + }, + { + "epoch": 1.6818700114025087, + "grad_norm": 2.53125, + "learning_rate": 4.812750236445206e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.8746587634086609, + "num_tokens": 78960887.0, + "step": 738 + }, + { + "epoch": 1.6841505131128849, + "grad_norm": 2.9375, + "learning_rate": 4.812034390694874e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.871962770819664, + "num_tokens": 79068181.0, + "step": 739 + }, + { + "epoch": 1.686431014823261, + "grad_norm": 2.625, + "learning_rate": 4.811317232678456e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.8781395107507706, + "num_tokens": 79175196.0, + "step": 740 + }, + { + "epoch": 1.6887115165336373, + "grad_norm": 3.703125, + "learning_rate": 4.810598762803e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8768087178468704, + "num_tokens": 79282678.0, + "step": 741 + }, + { + "epoch": 1.6909920182440137, + "grad_norm": 4.65625, + "learning_rate": 4.809878981476293e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.8780290633440018, + "num_tokens": 79389475.0, + "step": 742 + }, + { + "epoch": 1.69327251995439, + "grad_norm": 3.5, + "learning_rate": 4.80915788910687e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.8754473924636841, + "num_tokens": 79496094.0, + "step": 743 + }, + { + "epoch": 1.6955530216647663, + "grad_norm": 4.125, + "learning_rate": 4.80843548610401e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.8804915100336075, + "num_tokens": 79603317.0, + "step": 744 + }, + { + "epoch": 1.6978335233751425, + "grad_norm": 2.28125, + "learning_rate": 4.807711772877733e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.875912070274353, + "num_tokens": 79709931.0, + "step": 745 + }, + { + "epoch": 1.7001140250855187, + "grad_norm": 2.828125, + "learning_rate": 4.8069867498388066e-06, + "loss": 0.6093, + "mean_token_accuracy": 0.8721041977405548, + "num_tokens": 79816476.0, + "step": 746 + }, + { + "epoch": 1.702394526795895, + "grad_norm": 2.359375, + "learning_rate": 4.806260417398739e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.8775873631238937, + "num_tokens": 79923080.0, + "step": 747 + }, + { + "epoch": 1.7046750285062715, + "grad_norm": 2.46875, + "learning_rate": 4.805532775969783e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8796124309301376, + "num_tokens": 80030054.0, + "step": 748 + }, + { + "epoch": 1.7069555302166477, + "grad_norm": 2.1875, + "learning_rate": 4.804803825964933e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8757105469703674, + "num_tokens": 80137000.0, + "step": 749 + }, + { + "epoch": 1.709236031927024, + "grad_norm": 4.9375, + "learning_rate": 4.804073567797928e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.874399796128273, + "num_tokens": 80244175.0, + "step": 750 + }, + { + "epoch": 1.7115165336374, + "grad_norm": 2.59375, + "learning_rate": 4.803342001883247e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.8730274736881256, + "num_tokens": 80351450.0, + "step": 751 + }, + { + "epoch": 1.7137970353477765, + "grad_norm": 3.234375, + "learning_rate": 4.802609128636113e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8778874278068542, + "num_tokens": 80458377.0, + "step": 752 + }, + { + "epoch": 1.716077537058153, + "grad_norm": 4.8125, + "learning_rate": 4.801874948472492e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.8746256977319717, + "num_tokens": 80565320.0, + "step": 753 + }, + { + "epoch": 1.718358038768529, + "grad_norm": 3.359375, + "learning_rate": 4.801139461809089e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8773505538702011, + "num_tokens": 80672618.0, + "step": 754 + }, + { + "epoch": 1.7206385404789053, + "grad_norm": 4.34375, + "learning_rate": 4.800402669063353e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.8712969422340393, + "num_tokens": 80779049.0, + "step": 755 + }, + { + "epoch": 1.7229190421892815, + "grad_norm": 2.890625, + "learning_rate": 4.799664570653473e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8769599795341492, + "num_tokens": 80886174.0, + "step": 756 + }, + { + "epoch": 1.725199543899658, + "grad_norm": 4.53125, + "learning_rate": 4.79892516699838e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8780887126922607, + "num_tokens": 80993329.0, + "step": 757 + }, + { + "epoch": 1.7274800456100343, + "grad_norm": 6.96875, + "learning_rate": 4.798184458517745e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8770759999752045, + "num_tokens": 81101103.0, + "step": 758 + }, + { + "epoch": 1.7297605473204105, + "grad_norm": 6.6875, + "learning_rate": 4.797442445631978e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8771584331989288, + "num_tokens": 81208123.0, + "step": 759 + }, + { + "epoch": 1.7320410490307867, + "grad_norm": 3.796875, + "learning_rate": 4.7966991287622335e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.8718820959329605, + "num_tokens": 81315003.0, + "step": 760 + }, + { + "epoch": 1.734321550741163, + "grad_norm": 8.0625, + "learning_rate": 4.795954508330403e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.8779590427875519, + "num_tokens": 81421672.0, + "step": 761 + }, + { + "epoch": 1.7366020524515393, + "grad_norm": 11.4375, + "learning_rate": 4.795208584759119e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8754216283559799, + "num_tokens": 81528963.0, + "step": 762 + }, + { + "epoch": 1.7388825541619157, + "grad_norm": 7.625, + "learning_rate": 4.794461358471753e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8796775341033936, + "num_tokens": 81636332.0, + "step": 763 + }, + { + "epoch": 1.741163055872292, + "grad_norm": 9.5625, + "learning_rate": 4.7937128298924155e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8761927634477615, + "num_tokens": 81743814.0, + "step": 764 + }, + { + "epoch": 1.7434435575826681, + "grad_norm": 5.3125, + "learning_rate": 4.7929629994459584e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8852293342351913, + "num_tokens": 81851249.0, + "step": 765 + }, + { + "epoch": 1.7457240592930443, + "grad_norm": 4.1875, + "learning_rate": 4.792211867557969e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8791010826826096, + "num_tokens": 81958684.0, + "step": 766 + }, + { + "epoch": 1.7480045610034207, + "grad_norm": 5.625, + "learning_rate": 4.7914594346547774e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8784136772155762, + "num_tokens": 82065522.0, + "step": 767 + }, + { + "epoch": 1.7502850627137971, + "grad_norm": 5.75, + "learning_rate": 4.790705701163449e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8732837587594986, + "num_tokens": 82172365.0, + "step": 768 + }, + { + "epoch": 1.7525655644241733, + "grad_norm": 4.6875, + "learning_rate": 4.789950667511789e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.87830650806427, + "num_tokens": 82279622.0, + "step": 769 + }, + { + "epoch": 1.7548460661345495, + "grad_norm": 3.203125, + "learning_rate": 4.789194334128338e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.877634271979332, + "num_tokens": 82386626.0, + "step": 770 + }, + { + "epoch": 1.757126567844926, + "grad_norm": 2.46875, + "learning_rate": 4.788436701442378e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8743524998426437, + "num_tokens": 82494070.0, + "step": 771 + }, + { + "epoch": 1.7594070695553021, + "grad_norm": 4.9375, + "learning_rate": 4.787677769883926e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8753460049629211, + "num_tokens": 82601259.0, + "step": 772 + }, + { + "epoch": 1.7616875712656785, + "grad_norm": 4.8125, + "learning_rate": 4.786917539883738e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8806093335151672, + "num_tokens": 82708412.0, + "step": 773 + }, + { + "epoch": 1.7639680729760547, + "grad_norm": 5.0625, + "learning_rate": 4.786156011873304e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.8751035928726196, + "num_tokens": 82815320.0, + "step": 774 + }, + { + "epoch": 1.766248574686431, + "grad_norm": 2.875, + "learning_rate": 4.785393186284854e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.8772329390048981, + "num_tokens": 82922009.0, + "step": 775 + }, + { + "epoch": 1.7685290763968073, + "grad_norm": 2.046875, + "learning_rate": 4.784629063551354e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8803988993167877, + "num_tokens": 83029138.0, + "step": 776 + }, + { + "epoch": 1.7708095781071835, + "grad_norm": 2.890625, + "learning_rate": 4.783863644106502e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8786198645830154, + "num_tokens": 83136422.0, + "step": 777 + }, + { + "epoch": 1.77309007981756, + "grad_norm": 2.640625, + "learning_rate": 4.783096928384739e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8743276447057724, + "num_tokens": 83243905.0, + "step": 778 + }, + { + "epoch": 1.7753705815279361, + "grad_norm": 4.3125, + "learning_rate": 4.782328916821235e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8728191703557968, + "num_tokens": 83350681.0, + "step": 779 + }, + { + "epoch": 1.7776510832383123, + "grad_norm": 2.984375, + "learning_rate": 4.7815596098519004e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.873179629445076, + "num_tokens": 83457754.0, + "step": 780 + }, + { + "epoch": 1.7799315849486887, + "grad_norm": 2.1875, + "learning_rate": 4.780789007913379e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8735631853342056, + "num_tokens": 83565053.0, + "step": 781 + }, + { + "epoch": 1.782212086659065, + "grad_norm": 2.28125, + "learning_rate": 4.780017111443048e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8767279386520386, + "num_tokens": 83672036.0, + "step": 782 + }, + { + "epoch": 1.7844925883694414, + "grad_norm": 4.25, + "learning_rate": 4.779243920879023e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8717406392097473, + "num_tokens": 83778589.0, + "step": 783 + }, + { + "epoch": 1.7867730900798175, + "grad_norm": 2.765625, + "learning_rate": 4.77846943666015e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8808600008487701, + "num_tokens": 83886086.0, + "step": 784 + }, + { + "epoch": 1.7890535917901937, + "grad_norm": 2.5, + "learning_rate": 4.777693659226013e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8717308193445206, + "num_tokens": 83993388.0, + "step": 785 + }, + { + "epoch": 1.7913340935005702, + "grad_norm": 2.40625, + "learning_rate": 4.776916589016928e-06, + "loss": 0.612, + "mean_token_accuracy": 0.874013751745224, + "num_tokens": 84100238.0, + "step": 786 + }, + { + "epoch": 1.7936145952109466, + "grad_norm": 2.90625, + "learning_rate": 4.776138226473944e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8767507672309875, + "num_tokens": 84206817.0, + "step": 787 + }, + { + "epoch": 1.7958950969213228, + "grad_norm": 3.109375, + "learning_rate": 4.775358572038845e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8742737025022507, + "num_tokens": 84314065.0, + "step": 788 + }, + { + "epoch": 1.798175598631699, + "grad_norm": 2.046875, + "learning_rate": 4.774577626154148e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.873917818069458, + "num_tokens": 84421307.0, + "step": 789 + }, + { + "epoch": 1.8004561003420751, + "grad_norm": 2.65625, + "learning_rate": 4.773795389263104e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8776136040687561, + "num_tokens": 84528605.0, + "step": 790 + }, + { + "epoch": 1.8027366020524516, + "grad_norm": 2.34375, + "learning_rate": 4.773011861809694e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8759381324052811, + "num_tokens": 84635657.0, + "step": 791 + }, + { + "epoch": 1.805017103762828, + "grad_norm": 4.3125, + "learning_rate": 4.772227044238632e-06, + "loss": 0.6173, + "mean_token_accuracy": 0.8727094084024429, + "num_tokens": 84742929.0, + "step": 792 + }, + { + "epoch": 1.8072976054732042, + "grad_norm": 3.375, + "learning_rate": 4.771440936995367e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.876227617263794, + "num_tokens": 84849828.0, + "step": 793 + }, + { + "epoch": 1.8095781071835804, + "grad_norm": 2.671875, + "learning_rate": 4.770653540526079e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.8727654963731766, + "num_tokens": 84956976.0, + "step": 794 + }, + { + "epoch": 1.8118586088939566, + "grad_norm": 2.390625, + "learning_rate": 4.7698648552776785e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.8722630441188812, + "num_tokens": 85063714.0, + "step": 795 + }, + { + "epoch": 1.814139110604333, + "grad_norm": 6.9375, + "learning_rate": 4.769074881697806e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.883360892534256, + "num_tokens": 85171018.0, + "step": 796 + }, + { + "epoch": 1.8164196123147094, + "grad_norm": 8.1875, + "learning_rate": 4.768283620234838e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8781389743089676, + "num_tokens": 85278037.0, + "step": 797 + }, + { + "epoch": 1.8187001140250856, + "grad_norm": 7.28125, + "learning_rate": 4.767491071337877e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.8743828237056732, + "num_tokens": 85385317.0, + "step": 798 + }, + { + "epoch": 1.8209806157354618, + "grad_norm": 3.46875, + "learning_rate": 4.766697235456761e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.8782155364751816, + "num_tokens": 85493351.0, + "step": 799 + }, + { + "epoch": 1.823261117445838, + "grad_norm": 4.3125, + "learning_rate": 4.765902113042053e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.8723445981740952, + "num_tokens": 85599811.0, + "step": 800 + }, + { + "epoch": 1.8255416191562144, + "grad_norm": 4.875, + "learning_rate": 4.765105704545052e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8751823008060455, + "num_tokens": 85706831.0, + "step": 801 + }, + { + "epoch": 1.8278221208665908, + "grad_norm": 4.5625, + "learning_rate": 4.7643080104177815e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8715473413467407, + "num_tokens": 85813856.0, + "step": 802 + }, + { + "epoch": 1.830102622576967, + "grad_norm": 4.125, + "learning_rate": 4.763509031113e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8734237253665924, + "num_tokens": 85921277.0, + "step": 803 + }, + { + "epoch": 1.8323831242873432, + "grad_norm": 2.5, + "learning_rate": 4.7627087670841894e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8760107606649399, + "num_tokens": 86028244.0, + "step": 804 + }, + { + "epoch": 1.8346636259977194, + "grad_norm": 3.5625, + "learning_rate": 4.761907218785566e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8721684813499451, + "num_tokens": 86134885.0, + "step": 805 + }, + { + "epoch": 1.8369441277080958, + "grad_norm": 2.21875, + "learning_rate": 4.761104386672074e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8805957436561584, + "num_tokens": 86242405.0, + "step": 806 + }, + { + "epoch": 1.8392246294184722, + "grad_norm": 3.40625, + "learning_rate": 4.760300271199384e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.875573918223381, + "num_tokens": 86349613.0, + "step": 807 + }, + { + "epoch": 1.8415051311288484, + "grad_norm": 2.375, + "learning_rate": 4.759494872823896e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.8726819604635239, + "num_tokens": 86456544.0, + "step": 808 + }, + { + "epoch": 1.8437856328392246, + "grad_norm": 3.203125, + "learning_rate": 4.758688192002741e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8799534738063812, + "num_tokens": 86563859.0, + "step": 809 + }, + { + "epoch": 1.8460661345496008, + "grad_norm": 2.46875, + "learning_rate": 4.757880229193773e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8771905303001404, + "num_tokens": 86670639.0, + "step": 810 + }, + { + "epoch": 1.8483466362599772, + "grad_norm": 2.125, + "learning_rate": 4.757070984855577e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8817077875137329, + "num_tokens": 86777613.0, + "step": 811 + }, + { + "epoch": 1.8506271379703536, + "grad_norm": 2.640625, + "learning_rate": 4.756260459447465e-06, + "loss": 0.6173, + "mean_token_accuracy": 0.8729428201913834, + "num_tokens": 86884393.0, + "step": 812 + }, + { + "epoch": 1.8529076396807298, + "grad_norm": 3.0, + "learning_rate": 4.755448653429475e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.877841129899025, + "num_tokens": 86991781.0, + "step": 813 + }, + { + "epoch": 1.855188141391106, + "grad_norm": 2.859375, + "learning_rate": 4.754635567262372e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8776097446680069, + "num_tokens": 87098943.0, + "step": 814 + }, + { + "epoch": 1.8574686431014822, + "grad_norm": 2.171875, + "learning_rate": 4.753821201407648e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8773338198661804, + "num_tokens": 87205996.0, + "step": 815 + }, + { + "epoch": 1.8597491448118586, + "grad_norm": 2.34375, + "learning_rate": 4.7530055563275225e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.8762088716030121, + "num_tokens": 87313545.0, + "step": 816 + }, + { + "epoch": 1.862029646522235, + "grad_norm": 2.75, + "learning_rate": 4.7521886324849385e-06, + "loss": 0.626, + "mean_token_accuracy": 0.8743972480297089, + "num_tokens": 87420224.0, + "step": 817 + }, + { + "epoch": 1.8643101482326112, + "grad_norm": 3.203125, + "learning_rate": 4.751370430343568e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8746133893728256, + "num_tokens": 87527454.0, + "step": 818 + }, + { + "epoch": 1.8665906499429874, + "grad_norm": 3.484375, + "learning_rate": 4.750550950367805e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8807315528392792, + "num_tokens": 87634497.0, + "step": 819 + }, + { + "epoch": 1.8688711516533636, + "grad_norm": 2.25, + "learning_rate": 4.749730193022771e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8758885711431503, + "num_tokens": 87741318.0, + "step": 820 + }, + { + "epoch": 1.87115165336374, + "grad_norm": 5.78125, + "learning_rate": 4.748908158774312e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8823642432689667, + "num_tokens": 87848637.0, + "step": 821 + }, + { + "epoch": 1.8734321550741164, + "grad_norm": 2.515625, + "learning_rate": 4.748084848089e-06, + "loss": 0.602, + "mean_token_accuracy": 0.876085102558136, + "num_tokens": 87955464.0, + "step": 822 + }, + { + "epoch": 1.8757126567844926, + "grad_norm": 2.0, + "learning_rate": 4.747260261434128e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8766376227140427, + "num_tokens": 88062827.0, + "step": 823 + }, + { + "epoch": 1.8779931584948688, + "grad_norm": 4.375, + "learning_rate": 4.7464343992777175e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.8759739398956299, + "num_tokens": 88169682.0, + "step": 824 + }, + { + "epoch": 1.880273660205245, + "grad_norm": 3.5625, + "learning_rate": 4.74560726208851e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8760395795106888, + "num_tokens": 88276642.0, + "step": 825 + }, + { + "epoch": 1.8825541619156214, + "grad_norm": 3.046875, + "learning_rate": 4.744778850335974e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8747592866420746, + "num_tokens": 88383703.0, + "step": 826 + }, + { + "epoch": 1.8848346636259978, + "grad_norm": 2.640625, + "learning_rate": 4.7439491644903e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8750014901161194, + "num_tokens": 88490924.0, + "step": 827 + }, + { + "epoch": 1.887115165336374, + "grad_norm": 5.5, + "learning_rate": 4.743118205022402e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8755096942186356, + "num_tokens": 88597962.0, + "step": 828 + }, + { + "epoch": 1.8893956670467502, + "grad_norm": 5.25, + "learning_rate": 4.742285972403915e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.8763994574546814, + "num_tokens": 88705068.0, + "step": 829 + }, + { + "epoch": 1.8916761687571264, + "grad_norm": 3.265625, + "learning_rate": 4.7414524671071995e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8812403380870819, + "num_tokens": 88812440.0, + "step": 830 + }, + { + "epoch": 1.8939566704675028, + "grad_norm": 4.9375, + "learning_rate": 4.7406176896053356e-06, + "loss": 0.6033, + "mean_token_accuracy": 0.8769374042749405, + "num_tokens": 88919411.0, + "step": 831 + }, + { + "epoch": 1.8962371721778792, + "grad_norm": 2.171875, + "learning_rate": 4.739781640372129e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.8767253011465073, + "num_tokens": 89026619.0, + "step": 832 + }, + { + "epoch": 1.8985176738882554, + "grad_norm": 3.984375, + "learning_rate": 4.7389443198821035e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8763202428817749, + "num_tokens": 89133774.0, + "step": 833 + }, + { + "epoch": 1.9007981755986316, + "grad_norm": 2.734375, + "learning_rate": 4.738105728610507e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8780044764280319, + "num_tokens": 89240785.0, + "step": 834 + }, + { + "epoch": 1.9030786773090078, + "grad_norm": 2.453125, + "learning_rate": 4.737265867033307e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.876687616109848, + "num_tokens": 89347651.0, + "step": 835 + }, + { + "epoch": 1.9053591790193842, + "grad_norm": 6.125, + "learning_rate": 4.736424735627193e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.8766518086194992, + "num_tokens": 89454086.0, + "step": 836 + }, + { + "epoch": 1.9076396807297606, + "grad_norm": 4.03125, + "learning_rate": 4.735582334869575e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.8772305101156235, + "num_tokens": 89561179.0, + "step": 837 + }, + { + "epoch": 1.9099201824401368, + "grad_norm": 2.875, + "learning_rate": 4.734738665238583e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8851803094148636, + "num_tokens": 89668499.0, + "step": 838 + }, + { + "epoch": 1.912200684150513, + "grad_norm": 2.484375, + "learning_rate": 4.733893727213068e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8792501837015152, + "num_tokens": 89775580.0, + "step": 839 + }, + { + "epoch": 1.9144811858608894, + "grad_norm": 3.65625, + "learning_rate": 4.7330475212726e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8790338486433029, + "num_tokens": 89883322.0, + "step": 840 + }, + { + "epoch": 1.9167616875712656, + "grad_norm": 4.78125, + "learning_rate": 4.73220004789747e-06, + "loss": 0.6213, + "mean_token_accuracy": 0.8717529475688934, + "num_tokens": 89989995.0, + "step": 841 + }, + { + "epoch": 1.919042189281642, + "grad_norm": 4.90625, + "learning_rate": 4.7313513075686875e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.8774217069149017, + "num_tokens": 90097055.0, + "step": 842 + }, + { + "epoch": 1.9213226909920182, + "grad_norm": 2.15625, + "learning_rate": 4.73050130076798e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.8793853372335434, + "num_tokens": 90204483.0, + "step": 843 + }, + { + "epoch": 1.9236031927023944, + "grad_norm": 3.515625, + "learning_rate": 4.729650027977797e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8808252811431885, + "num_tokens": 90311830.0, + "step": 844 + }, + { + "epoch": 1.9258836944127709, + "grad_norm": 3.046875, + "learning_rate": 4.728797489681302e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8766701221466064, + "num_tokens": 90418653.0, + "step": 845 + }, + { + "epoch": 1.928164196123147, + "grad_norm": 3.71875, + "learning_rate": 4.7279436863623805e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.880815863609314, + "num_tokens": 90526355.0, + "step": 846 + }, + { + "epoch": 1.9304446978335235, + "grad_norm": 2.625, + "learning_rate": 4.7270886185056355e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8776162266731262, + "num_tokens": 90633735.0, + "step": 847 + }, + { + "epoch": 1.9327251995438997, + "grad_norm": 2.21875, + "learning_rate": 4.726232286596385e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.8745153993368149, + "num_tokens": 90740939.0, + "step": 848 + }, + { + "epoch": 1.9350057012542758, + "grad_norm": 3.625, + "learning_rate": 4.725374691120669e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.8727088868618011, + "num_tokens": 90847747.0, + "step": 849 + }, + { + "epoch": 1.9372862029646523, + "grad_norm": 4.5, + "learning_rate": 4.7245158325652396e-06, + "loss": 0.592, + "mean_token_accuracy": 0.878357782959938, + "num_tokens": 90954915.0, + "step": 850 + }, + { + "epoch": 1.9395667046750285, + "grad_norm": 6.09375, + "learning_rate": 4.7236557114175705e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.8722603023052216, + "num_tokens": 91061821.0, + "step": 851 + }, + { + "epoch": 1.9418472063854049, + "grad_norm": 2.671875, + "learning_rate": 4.722794328165849e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.8738327622413635, + "num_tokens": 91168909.0, + "step": 852 + }, + { + "epoch": 1.944127708095781, + "grad_norm": 3.546875, + "learning_rate": 4.721931683298979e-06, + "loss": 0.597, + "mean_token_accuracy": 0.8767188042402267, + "num_tokens": 91275666.0, + "step": 853 + }, + { + "epoch": 1.9464082098061573, + "grad_norm": 6.75, + "learning_rate": 4.721067777306582e-06, + "loss": 0.6291, + "mean_token_accuracy": 0.8720938414335251, + "num_tokens": 91382425.0, + "step": 854 + }, + { + "epoch": 1.9486887115165337, + "grad_norm": 6.9375, + "learning_rate": 4.7202026106789935e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.87422114610672, + "num_tokens": 91489317.0, + "step": 855 + }, + { + "epoch": 1.95096921322691, + "grad_norm": 4.875, + "learning_rate": 4.719336183907266e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8765899688005447, + "num_tokens": 91596598.0, + "step": 856 + }, + { + "epoch": 1.9532497149372863, + "grad_norm": 2.6875, + "learning_rate": 4.718468497483166e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.8780688494443893, + "num_tokens": 91703876.0, + "step": 857 + }, + { + "epoch": 1.9555302166476625, + "grad_norm": 3.5, + "learning_rate": 4.717599551899177e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8795433938503265, + "num_tokens": 91810908.0, + "step": 858 + }, + { + "epoch": 1.9578107183580387, + "grad_norm": 6.5, + "learning_rate": 4.716729347648494e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8784368187189102, + "num_tokens": 91918098.0, + "step": 859 + }, + { + "epoch": 1.960091220068415, + "grad_norm": 3.59375, + "learning_rate": 4.71585788522503e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8751647025346756, + "num_tokens": 92025449.0, + "step": 860 + }, + { + "epoch": 1.9623717217787915, + "grad_norm": 3.546875, + "learning_rate": 4.7149851651234085e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8768515288829803, + "num_tokens": 92132495.0, + "step": 861 + }, + { + "epoch": 1.9646522234891677, + "grad_norm": 6.25, + "learning_rate": 4.714111187838969e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.8760241121053696, + "num_tokens": 92240414.0, + "step": 862 + }, + { + "epoch": 1.9669327251995439, + "grad_norm": 2.421875, + "learning_rate": 4.713235953867764e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8732226341962814, + "num_tokens": 92346560.0, + "step": 863 + }, + { + "epoch": 1.96921322690992, + "grad_norm": 3.078125, + "learning_rate": 4.712359463706561e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8731984049081802, + "num_tokens": 92453339.0, + "step": 864 + }, + { + "epoch": 1.9714937286202965, + "grad_norm": 3.78125, + "learning_rate": 4.711481717852837e-06, + "loss": 0.6093, + "mean_token_accuracy": 0.8733879327774048, + "num_tokens": 92560826.0, + "step": 865 + }, + { + "epoch": 1.973774230330673, + "grad_norm": 3.546875, + "learning_rate": 4.710602716804784e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8723070323467255, + "num_tokens": 92667923.0, + "step": 866 + }, + { + "epoch": 1.976054732041049, + "grad_norm": 2.78125, + "learning_rate": 4.709722461061307e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8759769946336746, + "num_tokens": 92774822.0, + "step": 867 + }, + { + "epoch": 1.9783352337514253, + "grad_norm": 2.984375, + "learning_rate": 4.70884095112202e-06, + "loss": 0.5922, + "mean_token_accuracy": 0.8767600506544113, + "num_tokens": 92881978.0, + "step": 868 + }, + { + "epoch": 1.9806157354618015, + "grad_norm": 4.84375, + "learning_rate": 4.707958187487254e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8756605833768845, + "num_tokens": 92989352.0, + "step": 869 + }, + { + "epoch": 1.982896237172178, + "grad_norm": 3.78125, + "learning_rate": 4.707074170658046e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.8736278414726257, + "num_tokens": 93095509.0, + "step": 870 + }, + { + "epoch": 1.9851767388825543, + "grad_norm": 2.96875, + "learning_rate": 4.706188901136148e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8808690756559372, + "num_tokens": 93203007.0, + "step": 871 + }, + { + "epoch": 1.9874572405929305, + "grad_norm": 2.28125, + "learning_rate": 4.705302379424023e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.8731043189764023, + "num_tokens": 93310443.0, + "step": 872 + }, + { + "epoch": 1.9897377423033067, + "grad_norm": 4.25, + "learning_rate": 4.704414606024842e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.8732631206512451, + "num_tokens": 93416826.0, + "step": 873 + }, + { + "epoch": 1.9920182440136829, + "grad_norm": 2.09375, + "learning_rate": 4.703525581442488e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8822022676467896, + "num_tokens": 93523746.0, + "step": 874 + }, + { + "epoch": 1.9942987457240593, + "grad_norm": 3.5, + "learning_rate": 4.702635306181554e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.8761956989765167, + "num_tokens": 93630669.0, + "step": 875 + }, + { + "epoch": 1.9965792474344357, + "grad_norm": 2.421875, + "learning_rate": 4.701743780747345e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.8748619705438614, + "num_tokens": 93738175.0, + "step": 876 + }, + { + "epoch": 1.998859749144812, + "grad_norm": 4.1875, + "learning_rate": 4.700851005645872e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8752595335245132, + "num_tokens": 93845319.0, + "step": 877 + }, + { + "epoch": 2.0, + "grad_norm": 5.40625, + "learning_rate": 4.699956981383857e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8865090310573578, + "num_tokens": 93884464.0, + "step": 878 + }, + { + "epoch": 2.002280501710376, + "grad_norm": 2.625, + "learning_rate": 4.699061708468732e-06, + "loss": 0.604, + "mean_token_accuracy": 0.8751733750104904, + "num_tokens": 93991225.0, + "step": 879 + }, + { + "epoch": 2.0045610034207524, + "grad_norm": 2.390625, + "learning_rate": 4.698165187408635e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8786071985960007, + "num_tokens": 94098859.0, + "step": 880 + }, + { + "epoch": 2.0045610034207524, + "eval_loss": 0.6056435108184814, + "eval_mean_token_accuracy": 0.8761785948231193, + "eval_num_tokens": 94098859.0, + "eval_runtime": 58.5991, + "eval_samples_per_second": 143.091, + "eval_steps_per_second": 4.488, + "step": 880 + }, + { + "epoch": 2.006841505131129, + "grad_norm": 2.640625, + "learning_rate": 4.697267418712415e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8779609203338623, + "num_tokens": 94205931.0, + "step": 881 + }, + { + "epoch": 2.009122006841505, + "grad_norm": 2.734375, + "learning_rate": 4.6963684028896285e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.877967357635498, + "num_tokens": 94313152.0, + "step": 882 + }, + { + "epoch": 2.0114025085518814, + "grad_norm": 2.0625, + "learning_rate": 4.695468140450539e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8791947513818741, + "num_tokens": 94420319.0, + "step": 883 + }, + { + "epoch": 2.0136830102622576, + "grad_norm": 4.875, + "learning_rate": 4.6945666319061166e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8798933327198029, + "num_tokens": 94527420.0, + "step": 884 + }, + { + "epoch": 2.015963511972634, + "grad_norm": 2.859375, + "learning_rate": 4.6936638777680435e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8755367547273636, + "num_tokens": 94634228.0, + "step": 885 + }, + { + "epoch": 2.0182440136830104, + "grad_norm": 1.9140625, + "learning_rate": 4.6927598785487026e-06, + "loss": 0.6088, + "mean_token_accuracy": 0.873708963394165, + "num_tokens": 94740813.0, + "step": 886 + }, + { + "epoch": 2.0205245153933866, + "grad_norm": 3.25, + "learning_rate": 4.691854634761188e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8793173581361771, + "num_tokens": 94848566.0, + "step": 887 + }, + { + "epoch": 2.022805017103763, + "grad_norm": 3.015625, + "learning_rate": 4.690948146919299e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8790719360113144, + "num_tokens": 94955659.0, + "step": 888 + }, + { + "epoch": 2.025085518814139, + "grad_norm": 4.53125, + "learning_rate": 4.690040415537538e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8785732984542847, + "num_tokens": 95062760.0, + "step": 889 + }, + { + "epoch": 2.027366020524515, + "grad_norm": 3.5, + "learning_rate": 4.689131441131119e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8747037500143051, + "num_tokens": 95170435.0, + "step": 890 + }, + { + "epoch": 2.029646522234892, + "grad_norm": 2.34375, + "learning_rate": 4.6882212242159555e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8810647279024124, + "num_tokens": 95277421.0, + "step": 891 + }, + { + "epoch": 2.031927023945268, + "grad_norm": 2.296875, + "learning_rate": 4.687309765308671e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.881627082824707, + "num_tokens": 95384868.0, + "step": 892 + }, + { + "epoch": 2.034207525655644, + "grad_norm": 2.015625, + "learning_rate": 4.6863970649265914e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8783137798309326, + "num_tokens": 95492292.0, + "step": 893 + }, + { + "epoch": 2.0364880273660204, + "grad_norm": 5.78125, + "learning_rate": 4.685483123587748e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8808389455080032, + "num_tokens": 95599389.0, + "step": 894 + }, + { + "epoch": 2.0387685290763966, + "grad_norm": 2.96875, + "learning_rate": 4.684567941810876e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8855313509702682, + "num_tokens": 95706662.0, + "step": 895 + }, + { + "epoch": 2.0410490307867732, + "grad_norm": 2.796875, + "learning_rate": 4.683651520115414e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8826097548007965, + "num_tokens": 95814265.0, + "step": 896 + }, + { + "epoch": 2.0433295324971494, + "grad_norm": 2.90625, + "learning_rate": 4.682733859021508e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8777095377445221, + "num_tokens": 95921432.0, + "step": 897 + }, + { + "epoch": 2.0456100342075256, + "grad_norm": 5.71875, + "learning_rate": 4.681814959050002e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8748744130134583, + "num_tokens": 96029029.0, + "step": 898 + }, + { + "epoch": 2.047890535917902, + "grad_norm": 5.65625, + "learning_rate": 4.680894820722446e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8802212625741959, + "num_tokens": 96136531.0, + "step": 899 + }, + { + "epoch": 2.050171037628278, + "grad_norm": 6.875, + "learning_rate": 4.679973444561095e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.873874619603157, + "num_tokens": 96243395.0, + "step": 900 + }, + { + "epoch": 2.0524515393386547, + "grad_norm": 3.640625, + "learning_rate": 4.679050831088902e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8777193129062653, + "num_tokens": 96350083.0, + "step": 901 + }, + { + "epoch": 2.054732041049031, + "grad_norm": 2.109375, + "learning_rate": 4.678126980829525e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8807910680770874, + "num_tokens": 96457480.0, + "step": 902 + }, + { + "epoch": 2.057012542759407, + "grad_norm": 5.0, + "learning_rate": 4.677201894307325e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.872836172580719, + "num_tokens": 96564282.0, + "step": 903 + }, + { + "epoch": 2.0592930444697832, + "grad_norm": 4.75, + "learning_rate": 4.676275572047362e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.871756300330162, + "num_tokens": 96671356.0, + "step": 904 + }, + { + "epoch": 2.0615735461801594, + "grad_norm": 4.6875, + "learning_rate": 4.675348014575399e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.8757220953702927, + "num_tokens": 96778413.0, + "step": 905 + }, + { + "epoch": 2.063854047890536, + "grad_norm": 2.28125, + "learning_rate": 4.674419222417899e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8768678158521652, + "num_tokens": 96885296.0, + "step": 906 + }, + { + "epoch": 2.0661345496009123, + "grad_norm": 2.09375, + "learning_rate": 4.673489196102028e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.874673992395401, + "num_tokens": 96992919.0, + "step": 907 + }, + { + "epoch": 2.0684150513112884, + "grad_norm": 2.328125, + "learning_rate": 4.67255793615565e-06, + "loss": 0.6077, + "mean_token_accuracy": 0.8760568499565125, + "num_tokens": 97099795.0, + "step": 908 + }, + { + "epoch": 2.0706955530216646, + "grad_norm": 4.90625, + "learning_rate": 4.67162544310733e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8787839561700821, + "num_tokens": 97206999.0, + "step": 909 + }, + { + "epoch": 2.072976054732041, + "grad_norm": 7.03125, + "learning_rate": 4.670691717486333e-06, + "loss": 0.5961, + "mean_token_accuracy": 0.8767089992761612, + "num_tokens": 97314451.0, + "step": 910 + }, + { + "epoch": 2.0752565564424175, + "grad_norm": 5.84375, + "learning_rate": 4.669756759822625e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8748338222503662, + "num_tokens": 97421703.0, + "step": 911 + }, + { + "epoch": 2.0775370581527937, + "grad_norm": 2.15625, + "learning_rate": 4.668820570646868e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8794323354959488, + "num_tokens": 97528420.0, + "step": 912 + }, + { + "epoch": 2.07981755986317, + "grad_norm": 2.484375, + "learning_rate": 4.667883150490427e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8755746185779572, + "num_tokens": 97634954.0, + "step": 913 + }, + { + "epoch": 2.082098061573546, + "grad_norm": 3.3125, + "learning_rate": 4.666944499885361e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.8736224919557571, + "num_tokens": 97742085.0, + "step": 914 + }, + { + "epoch": 2.0843785632839227, + "grad_norm": 2.484375, + "learning_rate": 4.6660046193644315e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8764249086380005, + "num_tokens": 97849138.0, + "step": 915 + }, + { + "epoch": 2.086659064994299, + "grad_norm": 2.671875, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8795299530029297, + "num_tokens": 97956499.0, + "step": 916 + }, + { + "epoch": 2.088939566704675, + "grad_norm": 2.21875, + "learning_rate": 4.664121170709512e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8816528022289276, + "num_tokens": 98063712.0, + "step": 917 + }, + { + "epoch": 2.0912200684150513, + "grad_norm": 2.6875, + "learning_rate": 4.663177603644532e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8774126917123795, + "num_tokens": 98170731.0, + "step": 918 + }, + { + "epoch": 2.0935005701254275, + "grad_norm": 2.921875, + "learning_rate": 4.662232808801704e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.8704886138439178, + "num_tokens": 98277880.0, + "step": 919 + }, + { + "epoch": 2.095781071835804, + "grad_norm": 4.0, + "learning_rate": 4.661286786717278e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8788218349218369, + "num_tokens": 98385335.0, + "step": 920 + }, + { + "epoch": 2.0980615735461803, + "grad_norm": 3.234375, + "learning_rate": 4.660339537928198e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8791757225990295, + "num_tokens": 98492150.0, + "step": 921 + }, + { + "epoch": 2.1003420752565565, + "grad_norm": 2.046875, + "learning_rate": 4.659391062972102e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8784543722867966, + "num_tokens": 98599554.0, + "step": 922 + }, + { + "epoch": 2.1026225769669327, + "grad_norm": 2.265625, + "learning_rate": 4.658441362387328e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8808614313602448, + "num_tokens": 98706337.0, + "step": 923 + }, + { + "epoch": 2.104903078677309, + "grad_norm": 3.515625, + "learning_rate": 4.657490436712907e-06, + "loss": 0.62, + "mean_token_accuracy": 0.8705125451087952, + "num_tokens": 98812884.0, + "step": 924 + }, + { + "epoch": 2.1071835803876855, + "grad_norm": 3.796875, + "learning_rate": 4.6565382864885665e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8783458173274994, + "num_tokens": 98920205.0, + "step": 925 + }, + { + "epoch": 2.1094640820980617, + "grad_norm": 2.3125, + "learning_rate": 4.655584912254727e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8786965757608414, + "num_tokens": 99027392.0, + "step": 926 + }, + { + "epoch": 2.111744583808438, + "grad_norm": 3.09375, + "learning_rate": 4.654630314552508e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8778886198997498, + "num_tokens": 99134550.0, + "step": 927 + }, + { + "epoch": 2.114025085518814, + "grad_norm": 3.390625, + "learning_rate": 4.653674493923718e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8823033571243286, + "num_tokens": 99241929.0, + "step": 928 + }, + { + "epoch": 2.1163055872291903, + "grad_norm": 2.578125, + "learning_rate": 4.652717450910864e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.8728591203689575, + "num_tokens": 99349158.0, + "step": 929 + }, + { + "epoch": 2.118586088939567, + "grad_norm": 2.46875, + "learning_rate": 4.651759186057144e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.87718665599823, + "num_tokens": 99456113.0, + "step": 930 + }, + { + "epoch": 2.120866590649943, + "grad_norm": 2.640625, + "learning_rate": 4.650799699906452e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.8780839145183563, + "num_tokens": 99563254.0, + "step": 931 + }, + { + "epoch": 2.1231470923603193, + "grad_norm": 2.640625, + "learning_rate": 4.649838993003373e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8814170211553574, + "num_tokens": 99670632.0, + "step": 932 + }, + { + "epoch": 2.1254275940706955, + "grad_norm": 4.625, + "learning_rate": 4.648877065893186e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8802737295627594, + "num_tokens": 99777773.0, + "step": 933 + }, + { + "epoch": 2.1277080957810717, + "grad_norm": 3.5625, + "learning_rate": 4.647913919121861e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8730404675006866, + "num_tokens": 99885179.0, + "step": 934 + }, + { + "epoch": 2.1299885974914483, + "grad_norm": 2.21875, + "learning_rate": 4.646949553236064e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8850271999835968, + "num_tokens": 99992436.0, + "step": 935 + }, + { + "epoch": 2.1322690992018245, + "grad_norm": 4.75, + "learning_rate": 4.645983968783148e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8796833902597427, + "num_tokens": 100099422.0, + "step": 936 + }, + { + "epoch": 2.1345496009122007, + "grad_norm": 5.9375, + "learning_rate": 4.645017166311163e-06, + "loss": 0.596, + "mean_token_accuracy": 0.8761509507894516, + "num_tokens": 100207243.0, + "step": 937 + }, + { + "epoch": 2.136830102622577, + "grad_norm": 2.875, + "learning_rate": 4.644049146368844e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.8749994188547134, + "num_tokens": 100313796.0, + "step": 938 + }, + { + "epoch": 2.139110604332953, + "grad_norm": 1.9609375, + "learning_rate": 4.643079909505622e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8829146772623062, + "num_tokens": 100420992.0, + "step": 939 + }, + { + "epoch": 2.1413911060433297, + "grad_norm": 3.078125, + "learning_rate": 4.642109456271618e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8793570548295975, + "num_tokens": 100528158.0, + "step": 940 + }, + { + "epoch": 2.143671607753706, + "grad_norm": 4.15625, + "learning_rate": 4.64113778721764e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8770193010568619, + "num_tokens": 100635032.0, + "step": 941 + }, + { + "epoch": 2.145952109464082, + "grad_norm": 5.4375, + "learning_rate": 4.640164902895192e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8772297501564026, + "num_tokens": 100742739.0, + "step": 942 + }, + { + "epoch": 2.1482326111744583, + "grad_norm": 2.5, + "learning_rate": 4.6391908038564615e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8756915777921677, + "num_tokens": 100849851.0, + "step": 943 + }, + { + "epoch": 2.1505131128848345, + "grad_norm": 6.5, + "learning_rate": 4.6382154906543295e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8732953071594238, + "num_tokens": 100957153.0, + "step": 944 + }, + { + "epoch": 2.152793614595211, + "grad_norm": 5.3125, + "learning_rate": 4.637238963842365e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.8774750828742981, + "num_tokens": 101063587.0, + "step": 945 + }, + { + "epoch": 2.1550741163055873, + "grad_norm": 3.96875, + "learning_rate": 4.636261223974826e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8747821748256683, + "num_tokens": 101170370.0, + "step": 946 + }, + { + "epoch": 2.1573546180159635, + "grad_norm": 3.09375, + "learning_rate": 4.635282271606658e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8776073157787323, + "num_tokens": 101277426.0, + "step": 947 + }, + { + "epoch": 2.1596351197263397, + "grad_norm": 2.5, + "learning_rate": 4.634302107293497e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.876614436507225, + "num_tokens": 101383939.0, + "step": 948 + }, + { + "epoch": 2.161915621436716, + "grad_norm": 2.28125, + "learning_rate": 4.633320731591663e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8757250159978867, + "num_tokens": 101491379.0, + "step": 949 + }, + { + "epoch": 2.1641961231470925, + "grad_norm": 4.34375, + "learning_rate": 4.632338145058167e-06, + "loss": 0.6244, + "mean_token_accuracy": 0.8718846142292023, + "num_tokens": 101598608.0, + "step": 950 + }, + { + "epoch": 2.1664766248574687, + "grad_norm": 1.96875, + "learning_rate": 4.631354348250706e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8830228000879288, + "num_tokens": 101706020.0, + "step": 951 + }, + { + "epoch": 2.168757126567845, + "grad_norm": 4.40625, + "learning_rate": 4.630369341727665e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.872904360294342, + "num_tokens": 101812698.0, + "step": 952 + }, + { + "epoch": 2.171037628278221, + "grad_norm": 4.3125, + "learning_rate": 4.629383126048114e-06, + "loss": 0.608, + "mean_token_accuracy": 0.8747462183237076, + "num_tokens": 101919836.0, + "step": 953 + }, + { + "epoch": 2.1733181299885973, + "grad_norm": 4.875, + "learning_rate": 4.6283957017718105e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8803711831569672, + "num_tokens": 102027244.0, + "step": 954 + }, + { + "epoch": 2.175598631698974, + "grad_norm": 1.96875, + "learning_rate": 4.627407069459196e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8786141574382782, + "num_tokens": 102134050.0, + "step": 955 + }, + { + "epoch": 2.17787913340935, + "grad_norm": 3.8125, + "learning_rate": 4.626417229671401e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8817842751741409, + "num_tokens": 102241872.0, + "step": 956 + }, + { + "epoch": 2.1801596351197263, + "grad_norm": 4.28125, + "learning_rate": 4.625426182970237e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8747357130050659, + "num_tokens": 102348674.0, + "step": 957 + }, + { + "epoch": 2.1824401368301025, + "grad_norm": 3.796875, + "learning_rate": 4.6244339299182065e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.8756296783685684, + "num_tokens": 102455493.0, + "step": 958 + }, + { + "epoch": 2.1847206385404787, + "grad_norm": 2.671875, + "learning_rate": 4.62344047107849e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8777966946363449, + "num_tokens": 102562617.0, + "step": 959 + }, + { + "epoch": 2.1870011402508553, + "grad_norm": 3.90625, + "learning_rate": 4.622445807014956e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8824858963489532, + "num_tokens": 102670281.0, + "step": 960 + }, + { + "epoch": 2.1892816419612315, + "grad_norm": 3.125, + "learning_rate": 4.621449938292159e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8756995797157288, + "num_tokens": 102778053.0, + "step": 961 + }, + { + "epoch": 2.1915621436716077, + "grad_norm": 4.46875, + "learning_rate": 4.620452865475331e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8776930570602417, + "num_tokens": 102885553.0, + "step": 962 + }, + { + "epoch": 2.193842645381984, + "grad_norm": 2.46875, + "learning_rate": 4.6194545891303955e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.8760574162006378, + "num_tokens": 102992589.0, + "step": 963 + }, + { + "epoch": 2.19612314709236, + "grad_norm": 6.25, + "learning_rate": 4.618455109823952e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.87519970536232, + "num_tokens": 103099395.0, + "step": 964 + }, + { + "epoch": 2.1984036488027368, + "grad_norm": 3.84375, + "learning_rate": 4.617454428123287e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8743910491466522, + "num_tokens": 103205910.0, + "step": 965 + }, + { + "epoch": 2.200684150513113, + "grad_norm": 5.21875, + "learning_rate": 4.616452544596367e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8808601945638657, + "num_tokens": 103313312.0, + "step": 966 + }, + { + "epoch": 2.202964652223489, + "grad_norm": 2.25, + "learning_rate": 4.615449459811843e-06, + "loss": 0.587, + "mean_token_accuracy": 0.882127583026886, + "num_tokens": 103420313.0, + "step": 967 + }, + { + "epoch": 2.2052451539338653, + "grad_norm": 2.4375, + "learning_rate": 4.614445174339045e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8822465240955353, + "num_tokens": 103527336.0, + "step": 968 + }, + { + "epoch": 2.2075256556442415, + "grad_norm": 6.46875, + "learning_rate": 4.613439688747988e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8785429149866104, + "num_tokens": 103634750.0, + "step": 969 + }, + { + "epoch": 2.209806157354618, + "grad_norm": 6.09375, + "learning_rate": 4.612433003609365e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8751912713050842, + "num_tokens": 103741472.0, + "step": 970 + }, + { + "epoch": 2.2120866590649944, + "grad_norm": 3.515625, + "learning_rate": 4.611425119494552e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8733502626419067, + "num_tokens": 103848855.0, + "step": 971 + }, + { + "epoch": 2.2143671607753705, + "grad_norm": 2.53125, + "learning_rate": 4.6104160369756025e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8768988847732544, + "num_tokens": 103956579.0, + "step": 972 + }, + { + "epoch": 2.2166476624857467, + "grad_norm": 3.578125, + "learning_rate": 4.609405756625254e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8784401416778564, + "num_tokens": 104063842.0, + "step": 973 + }, + { + "epoch": 2.2189281641961234, + "grad_norm": 3.109375, + "learning_rate": 4.608394279016921e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8794825226068497, + "num_tokens": 104171052.0, + "step": 974 + }, + { + "epoch": 2.2212086659064996, + "grad_norm": 3.3125, + "learning_rate": 4.6073816047247e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8819704353809357, + "num_tokens": 104278607.0, + "step": 975 + }, + { + "epoch": 2.2234891676168758, + "grad_norm": 2.265625, + "learning_rate": 4.606367734323365e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8778064996004105, + "num_tokens": 104385870.0, + "step": 976 + }, + { + "epoch": 2.225769669327252, + "grad_norm": 3.109375, + "learning_rate": 4.605352668388369e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8713217675685883, + "num_tokens": 104493565.0, + "step": 977 + }, + { + "epoch": 2.228050171037628, + "grad_norm": 3.34375, + "learning_rate": 4.6043364074958435e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8773491680622101, + "num_tokens": 104600945.0, + "step": 978 + }, + { + "epoch": 2.2303306727480043, + "grad_norm": 2.6875, + "learning_rate": 4.6033189522226e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8754960596561432, + "num_tokens": 104707727.0, + "step": 979 + }, + { + "epoch": 2.232611174458381, + "grad_norm": 2.5625, + "learning_rate": 4.602300303146123e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8821861445903778, + "num_tokens": 104815334.0, + "step": 980 + }, + { + "epoch": 2.234891676168757, + "grad_norm": 2.140625, + "learning_rate": 4.601280460844583e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8821869492530823, + "num_tokens": 104922027.0, + "step": 981 + }, + { + "epoch": 2.2371721778791334, + "grad_norm": 3.125, + "learning_rate": 4.6002594258968185e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8820158392190933, + "num_tokens": 105028843.0, + "step": 982 + }, + { + "epoch": 2.2394526795895096, + "grad_norm": 3.546875, + "learning_rate": 4.599237198882351e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.8787632882595062, + "num_tokens": 105135688.0, + "step": 983 + }, + { + "epoch": 2.241733181299886, + "grad_norm": 2.875, + "learning_rate": 4.598213780381377e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8801819086074829, + "num_tokens": 105242376.0, + "step": 984 + }, + { + "epoch": 2.2440136830102624, + "grad_norm": 3.046875, + "learning_rate": 4.59718917097477e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8789917528629303, + "num_tokens": 105349325.0, + "step": 985 + }, + { + "epoch": 2.2462941847206386, + "grad_norm": 5.8125, + "learning_rate": 4.596163371244076e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8811417818069458, + "num_tokens": 105456213.0, + "step": 986 + }, + { + "epoch": 2.2485746864310148, + "grad_norm": 7.59375, + "learning_rate": 4.595136381771521e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8762593269348145, + "num_tokens": 105562679.0, + "step": 987 + }, + { + "epoch": 2.250855188141391, + "grad_norm": 7.8125, + "learning_rate": 4.594108203140004e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.8737095445394516, + "num_tokens": 105670038.0, + "step": 988 + }, + { + "epoch": 2.253135689851767, + "grad_norm": 3.65625, + "learning_rate": 4.593078835933099e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8798780590295792, + "num_tokens": 105777357.0, + "step": 989 + }, + { + "epoch": 2.255416191562144, + "grad_norm": 2.4375, + "learning_rate": 4.592048280735055e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.879479706287384, + "num_tokens": 105884657.0, + "step": 990 + }, + { + "epoch": 2.25769669327252, + "grad_norm": 5.71875, + "learning_rate": 4.591016538130796e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8783192932605743, + "num_tokens": 105991145.0, + "step": 991 + }, + { + "epoch": 2.259977194982896, + "grad_norm": 5.59375, + "learning_rate": 4.589983608705918e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.8753475993871689, + "num_tokens": 106097808.0, + "step": 992 + }, + { + "epoch": 2.2622576966932724, + "grad_norm": 5.78125, + "learning_rate": 4.588949493046693e-06, + "loss": 0.628, + "mean_token_accuracy": 0.8707805275917053, + "num_tokens": 106204975.0, + "step": 993 + }, + { + "epoch": 2.264538198403649, + "grad_norm": 4.375, + "learning_rate": 4.587914191740064e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.8718110471963882, + "num_tokens": 106311506.0, + "step": 994 + }, + { + "epoch": 2.266818700114025, + "grad_norm": 2.765625, + "learning_rate": 4.586877705373648e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.8814664483070374, + "num_tokens": 106418583.0, + "step": 995 + }, + { + "epoch": 2.2690992018244014, + "grad_norm": 3.625, + "learning_rate": 4.585840034535736e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.881786435842514, + "num_tokens": 106526615.0, + "step": 996 + }, + { + "epoch": 2.2713797035347776, + "grad_norm": 7.40625, + "learning_rate": 4.584801179815289e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.878081277012825, + "num_tokens": 106633800.0, + "step": 997 + }, + { + "epoch": 2.2736602052451538, + "grad_norm": 7.59375, + "learning_rate": 4.583761141801941e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8735693991184235, + "num_tokens": 106740741.0, + "step": 998 + }, + { + "epoch": 2.27594070695553, + "grad_norm": 5.0625, + "learning_rate": 4.5827199210859975e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8779504746198654, + "num_tokens": 106847997.0, + "step": 999 + }, + { + "epoch": 2.2782212086659066, + "grad_norm": 5.4375, + "learning_rate": 4.581677518258435e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8708099573850632, + "num_tokens": 106955093.0, + "step": 1000 + }, + { + "epoch": 2.280501710376283, + "grad_norm": 5.25, + "learning_rate": 4.580633933910901e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.881976768374443, + "num_tokens": 107062229.0, + "step": 1001 + }, + { + "epoch": 2.282782212086659, + "grad_norm": 5.5625, + "learning_rate": 4.579589168635715e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.8718415945768356, + "num_tokens": 107168673.0, + "step": 1002 + }, + { + "epoch": 2.285062713797035, + "grad_norm": 3.765625, + "learning_rate": 4.578543223025865e-06, + "loss": 0.6294, + "mean_token_accuracy": 0.8693426996469498, + "num_tokens": 107275384.0, + "step": 1003 + }, + { + "epoch": 2.287343215507412, + "grad_norm": 3.265625, + "learning_rate": 4.577496097675009e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8758938610553741, + "num_tokens": 107382155.0, + "step": 1004 + }, + { + "epoch": 2.289623717217788, + "grad_norm": 3.03125, + "learning_rate": 4.576447793177476e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8743392378091812, + "num_tokens": 107488845.0, + "step": 1005 + }, + { + "epoch": 2.291904218928164, + "grad_norm": 2.1875, + "learning_rate": 4.575398310128263e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8819352090358734, + "num_tokens": 107596244.0, + "step": 1006 + }, + { + "epoch": 2.2941847206385404, + "grad_norm": 3.140625, + "learning_rate": 4.574347649123036e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.8723960518836975, + "num_tokens": 107703259.0, + "step": 1007 + }, + { + "epoch": 2.2964652223489166, + "grad_norm": 2.5625, + "learning_rate": 4.57329581075813e-06, + "loss": 0.626, + "mean_token_accuracy": 0.8719481080770493, + "num_tokens": 107810343.0, + "step": 1008 + }, + { + "epoch": 2.2987457240592932, + "grad_norm": 5.1875, + "learning_rate": 4.572242795630549e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8785750865936279, + "num_tokens": 107917748.0, + "step": 1009 + }, + { + "epoch": 2.3010262257696694, + "grad_norm": 2.609375, + "learning_rate": 4.571188604337963e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8774587363004684, + "num_tokens": 108025056.0, + "step": 1010 + }, + { + "epoch": 2.3033067274800456, + "grad_norm": 2.09375, + "learning_rate": 4.570133237478711e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8772729784250259, + "num_tokens": 108131874.0, + "step": 1011 + }, + { + "epoch": 2.305587229190422, + "grad_norm": 3.765625, + "learning_rate": 4.5690766956517985e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.883285716176033, + "num_tokens": 108238676.0, + "step": 1012 + }, + { + "epoch": 2.307867730900798, + "grad_norm": 4.8125, + "learning_rate": 4.568018979456899e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.87696273624897, + "num_tokens": 108345907.0, + "step": 1013 + }, + { + "epoch": 2.3101482326111746, + "grad_norm": 5.46875, + "learning_rate": 4.566960089494351e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8753390312194824, + "num_tokens": 108453246.0, + "step": 1014 + }, + { + "epoch": 2.312428734321551, + "grad_norm": 2.484375, + "learning_rate": 4.5659000263651615e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8795862644910812, + "num_tokens": 108560344.0, + "step": 1015 + }, + { + "epoch": 2.314709236031927, + "grad_norm": 2.46875, + "learning_rate": 4.564838790671e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8801321387290955, + "num_tokens": 108667748.0, + "step": 1016 + }, + { + "epoch": 2.316989737742303, + "grad_norm": 3.90625, + "learning_rate": 4.5637763830142046e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8832177221775055, + "num_tokens": 108775087.0, + "step": 1017 + }, + { + "epoch": 2.3192702394526794, + "grad_norm": 4.96875, + "learning_rate": 4.562712803997776e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.8724011331796646, + "num_tokens": 108882040.0, + "step": 1018 + }, + { + "epoch": 2.321550741163056, + "grad_norm": 2.203125, + "learning_rate": 4.5616480542253825e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.8780981302261353, + "num_tokens": 108988642.0, + "step": 1019 + }, + { + "epoch": 2.3238312428734322, + "grad_norm": 2.59375, + "learning_rate": 4.5605821343013555e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8755066245794296, + "num_tokens": 109096022.0, + "step": 1020 + }, + { + "epoch": 2.3261117445838084, + "grad_norm": 3.796875, + "learning_rate": 4.55951504483069e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.878739207983017, + "num_tokens": 109203309.0, + "step": 1021 + }, + { + "epoch": 2.3283922462941846, + "grad_norm": 2.71875, + "learning_rate": 4.558446786419045e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8794536143541336, + "num_tokens": 109309812.0, + "step": 1022 + }, + { + "epoch": 2.330672748004561, + "grad_norm": 2.296875, + "learning_rate": 4.557377359672745e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8791648596525192, + "num_tokens": 109416293.0, + "step": 1023 + }, + { + "epoch": 2.3329532497149374, + "grad_norm": 2.65625, + "learning_rate": 4.556306765198775e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8814702332019806, + "num_tokens": 109523500.0, + "step": 1024 + }, + { + "epoch": 2.3352337514253136, + "grad_norm": 2.140625, + "learning_rate": 4.555235003604782e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8794472515583038, + "num_tokens": 109630312.0, + "step": 1025 + }, + { + "epoch": 2.33751425313569, + "grad_norm": 4.65625, + "learning_rate": 4.55416207549908e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8786284774541855, + "num_tokens": 109737301.0, + "step": 1026 + }, + { + "epoch": 2.339794754846066, + "grad_norm": 4.625, + "learning_rate": 4.5530879814906404e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.8746591061353683, + "num_tokens": 109844193.0, + "step": 1027 + }, + { + "epoch": 2.342075256556442, + "grad_norm": 2.5, + "learning_rate": 4.5520127221891e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.8710612952709198, + "num_tokens": 109951493.0, + "step": 1028 + }, + { + "epoch": 2.344355758266819, + "grad_norm": 2.640625, + "learning_rate": 4.5509362982047525e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8757833987474442, + "num_tokens": 110058293.0, + "step": 1029 + }, + { + "epoch": 2.346636259977195, + "grad_norm": 2.828125, + "learning_rate": 4.549858710148558e-06, + "loss": 0.592, + "mean_token_accuracy": 0.876603439450264, + "num_tokens": 110165755.0, + "step": 1030 + }, + { + "epoch": 2.3489167616875712, + "grad_norm": 4.125, + "learning_rate": 4.548779958632134e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8827737718820572, + "num_tokens": 110273020.0, + "step": 1031 + }, + { + "epoch": 2.3511972633979474, + "grad_norm": 2.84375, + "learning_rate": 4.5477000442677575e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8793087154626846, + "num_tokens": 110379923.0, + "step": 1032 + }, + { + "epoch": 2.353477765108324, + "grad_norm": 3.171875, + "learning_rate": 4.546618967668369e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8808025866746902, + "num_tokens": 110487050.0, + "step": 1033 + }, + { + "epoch": 2.3557582668187003, + "grad_norm": 2.28125, + "learning_rate": 4.545536729447566e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8820274025201797, + "num_tokens": 110594620.0, + "step": 1034 + }, + { + "epoch": 2.3580387685290765, + "grad_norm": 3.359375, + "learning_rate": 4.544453330219606e-06, + "loss": 0.5961, + "mean_token_accuracy": 0.8768385797739029, + "num_tokens": 110701829.0, + "step": 1035 + }, + { + "epoch": 2.3603192702394526, + "grad_norm": 4.84375, + "learning_rate": 4.543368770599406e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8801583051681519, + "num_tokens": 110809252.0, + "step": 1036 + }, + { + "epoch": 2.362599771949829, + "grad_norm": 2.484375, + "learning_rate": 4.542283051202539e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8764929324388504, + "num_tokens": 110916093.0, + "step": 1037 + }, + { + "epoch": 2.364880273660205, + "grad_norm": 6.09375, + "learning_rate": 4.541196172645242e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8785767704248428, + "num_tokens": 111023473.0, + "step": 1038 + }, + { + "epoch": 2.3671607753705817, + "grad_norm": 6.9375, + "learning_rate": 4.540108135544403e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8778847008943558, + "num_tokens": 111130747.0, + "step": 1039 + }, + { + "epoch": 2.369441277080958, + "grad_norm": 4.8125, + "learning_rate": 4.5390189405175725e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8778575360774994, + "num_tokens": 111237498.0, + "step": 1040 + }, + { + "epoch": 2.371721778791334, + "grad_norm": 2.328125, + "learning_rate": 4.537928588182955e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.8711429536342621, + "num_tokens": 111344543.0, + "step": 1041 + }, + { + "epoch": 2.3740022805017102, + "grad_norm": 3.125, + "learning_rate": 4.536837079159416e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.8806983381509781, + "num_tokens": 111451314.0, + "step": 1042 + }, + { + "epoch": 2.376282782212087, + "grad_norm": 3.09375, + "learning_rate": 4.535744414066473e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8755781650543213, + "num_tokens": 111558317.0, + "step": 1043 + }, + { + "epoch": 2.378563283922463, + "grad_norm": 2.53125, + "learning_rate": 4.534650593524302e-06, + "loss": 0.6024, + "mean_token_accuracy": 0.874798059463501, + "num_tokens": 111664986.0, + "step": 1044 + }, + { + "epoch": 2.3808437856328393, + "grad_norm": 4.03125, + "learning_rate": 4.533555618153735e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8804291188716888, + "num_tokens": 111772027.0, + "step": 1045 + }, + { + "epoch": 2.3831242873432155, + "grad_norm": 2.5, + "learning_rate": 4.532459488576258e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8797705769538879, + "num_tokens": 111878671.0, + "step": 1046 + }, + { + "epoch": 2.3854047890535917, + "grad_norm": 4.90625, + "learning_rate": 4.531362205414013e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8787082433700562, + "num_tokens": 111985253.0, + "step": 1047 + }, + { + "epoch": 2.387685290763968, + "grad_norm": 3.5, + "learning_rate": 4.530263769289798e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8787430375814438, + "num_tokens": 112092830.0, + "step": 1048 + }, + { + "epoch": 2.3899657924743445, + "grad_norm": 4.625, + "learning_rate": 4.529164180827063e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.8749395608901978, + "num_tokens": 112199272.0, + "step": 1049 + }, + { + "epoch": 2.3922462941847207, + "grad_norm": 5.875, + "learning_rate": 4.528063440649913e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8786779493093491, + "num_tokens": 112306480.0, + "step": 1050 + }, + { + "epoch": 2.394526795895097, + "grad_norm": 2.1875, + "learning_rate": 4.526961549383109e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.8784288913011551, + "num_tokens": 112413379.0, + "step": 1051 + }, + { + "epoch": 2.396807297605473, + "grad_norm": 3.796875, + "learning_rate": 4.52585850765206e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8735271543264389, + "num_tokens": 112520374.0, + "step": 1052 + }, + { + "epoch": 2.3990877993158497, + "grad_norm": 2.4375, + "learning_rate": 4.524754316082833e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8810184299945831, + "num_tokens": 112627321.0, + "step": 1053 + }, + { + "epoch": 2.401368301026226, + "grad_norm": 6.4375, + "learning_rate": 4.5236489753021465e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8768182098865509, + "num_tokens": 112734136.0, + "step": 1054 + }, + { + "epoch": 2.403648802736602, + "grad_norm": 2.734375, + "learning_rate": 4.522542485937369e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8769848495721817, + "num_tokens": 112841135.0, + "step": 1055 + }, + { + "epoch": 2.4059293044469783, + "grad_norm": 3.71875, + "learning_rate": 4.521434848616523e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.8787764012813568, + "num_tokens": 112948501.0, + "step": 1056 + }, + { + "epoch": 2.4082098061573545, + "grad_norm": 2.40625, + "learning_rate": 4.520326063968283e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8776495456695557, + "num_tokens": 113055115.0, + "step": 1057 + }, + { + "epoch": 2.4104903078677307, + "grad_norm": 2.859375, + "learning_rate": 4.5192161326219716e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8828926682472229, + "num_tokens": 113162261.0, + "step": 1058 + }, + { + "epoch": 2.4127708095781073, + "grad_norm": 2.03125, + "learning_rate": 4.5181050552075665e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8777969181537628, + "num_tokens": 113269282.0, + "step": 1059 + }, + { + "epoch": 2.4150513112884835, + "grad_norm": 2.453125, + "learning_rate": 4.516992832355694e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8747198283672333, + "num_tokens": 113375649.0, + "step": 1060 + }, + { + "epoch": 2.4173318129988597, + "grad_norm": 2.53125, + "learning_rate": 4.515879464697629e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8800251632928848, + "num_tokens": 113483268.0, + "step": 1061 + }, + { + "epoch": 2.419612314709236, + "grad_norm": 2.53125, + "learning_rate": 4.514764952865297e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8789681494235992, + "num_tokens": 113590044.0, + "step": 1062 + }, + { + "epoch": 2.4218928164196125, + "grad_norm": 4.34375, + "learning_rate": 4.513649297491275e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.8741051852703094, + "num_tokens": 113696761.0, + "step": 1063 + }, + { + "epoch": 2.4241733181299887, + "grad_norm": 2.78125, + "learning_rate": 4.512532499208787e-06, + "loss": 0.606, + "mean_token_accuracy": 0.8762391060590744, + "num_tokens": 113804097.0, + "step": 1064 + }, + { + "epoch": 2.426453819840365, + "grad_norm": 2.21875, + "learning_rate": 4.511414558651706e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8823606818914413, + "num_tokens": 113911914.0, + "step": 1065 + }, + { + "epoch": 2.428734321550741, + "grad_norm": 2.953125, + "learning_rate": 4.5102954764545525e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8803216964006424, + "num_tokens": 114018818.0, + "step": 1066 + }, + { + "epoch": 2.4310148232611173, + "grad_norm": 2.671875, + "learning_rate": 4.509175253252497e-06, + "loss": 0.6118, + "mean_token_accuracy": 0.8786596357822418, + "num_tokens": 114125625.0, + "step": 1067 + }, + { + "epoch": 2.433295324971494, + "grad_norm": 2.6875, + "learning_rate": 4.508053889681357e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.87969671189785, + "num_tokens": 114232527.0, + "step": 1068 + }, + { + "epoch": 2.43557582668187, + "grad_norm": 2.109375, + "learning_rate": 4.5069313863775956e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8781027346849442, + "num_tokens": 114339323.0, + "step": 1069 + }, + { + "epoch": 2.4378563283922463, + "grad_norm": 2.625, + "learning_rate": 4.505807743978325e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.877943754196167, + "num_tokens": 114446811.0, + "step": 1070 + }, + { + "epoch": 2.4401368301026225, + "grad_norm": 2.8125, + "learning_rate": 4.5046829631213014e-06, + "loss": 0.613, + "mean_token_accuracy": 0.8732447922229767, + "num_tokens": 114553671.0, + "step": 1071 + }, + { + "epoch": 2.4424173318129987, + "grad_norm": 2.890625, + "learning_rate": 4.503557044444931e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8797748982906342, + "num_tokens": 114660235.0, + "step": 1072 + }, + { + "epoch": 2.4446978335233753, + "grad_norm": 6.09375, + "learning_rate": 4.502429988588263e-06, + "loss": 0.6077, + "mean_token_accuracy": 0.8732910007238388, + "num_tokens": 114767907.0, + "step": 1073 + }, + { + "epoch": 2.4469783352337515, + "grad_norm": 3.640625, + "learning_rate": 4.50130179619099e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8783971816301346, + "num_tokens": 114874874.0, + "step": 1074 + }, + { + "epoch": 2.4492588369441277, + "grad_norm": 2.390625, + "learning_rate": 4.500172467893455e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8795461803674698, + "num_tokens": 114981618.0, + "step": 1075 + }, + { + "epoch": 2.451539338654504, + "grad_norm": 5.0625, + "learning_rate": 4.499042004336642e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8823413103818893, + "num_tokens": 115088543.0, + "step": 1076 + }, + { + "epoch": 2.45381984036488, + "grad_norm": 6.90625, + "learning_rate": 4.497910406162182e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8751686066389084, + "num_tokens": 115195316.0, + "step": 1077 + }, + { + "epoch": 2.4561003420752567, + "grad_norm": 6.28125, + "learning_rate": 4.496777674012345e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.8768873512744904, + "num_tokens": 115302063.0, + "step": 1078 + }, + { + "epoch": 2.458380843785633, + "grad_norm": 2.125, + "learning_rate": 4.495643808530049e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.8746660053730011, + "num_tokens": 115409028.0, + "step": 1079 + }, + { + "epoch": 2.460661345496009, + "grad_norm": 4.375, + "learning_rate": 4.494508810358855e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.8768034875392914, + "num_tokens": 115516132.0, + "step": 1080 + }, + { + "epoch": 2.4629418472063853, + "grad_norm": 5.0625, + "learning_rate": 4.4933726801429665e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8732063323259354, + "num_tokens": 115623543.0, + "step": 1081 + }, + { + "epoch": 2.4652223489167615, + "grad_norm": 6.0, + "learning_rate": 4.4922354185272275e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.875907376408577, + "num_tokens": 115730261.0, + "step": 1082 + }, + { + "epoch": 2.467502850627138, + "grad_norm": 5.90625, + "learning_rate": 4.491097026157127e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8774657398462296, + "num_tokens": 115837186.0, + "step": 1083 + }, + { + "epoch": 2.4697833523375143, + "grad_norm": 3.375, + "learning_rate": 4.489957503678794e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8786139786243439, + "num_tokens": 115944585.0, + "step": 1084 + }, + { + "epoch": 2.4720638540478905, + "grad_norm": 6.3125, + "learning_rate": 4.488816851738999e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.881994903087616, + "num_tokens": 116051127.0, + "step": 1085 + }, + { + "epoch": 2.4743443557582667, + "grad_norm": 4.0, + "learning_rate": 4.487675070985156e-06, + "loss": 0.591, + "mean_token_accuracy": 0.879449263215065, + "num_tokens": 116158228.0, + "step": 1086 + }, + { + "epoch": 2.476624857468643, + "grad_norm": 3.5625, + "learning_rate": 4.4865321620653144e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8789650350809097, + "num_tokens": 116265116.0, + "step": 1087 + }, + { + "epoch": 2.4789053591790196, + "grad_norm": 4.34375, + "learning_rate": 4.485388125628171e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8748547732830048, + "num_tokens": 116372159.0, + "step": 1088 + }, + { + "epoch": 2.4811858608893957, + "grad_norm": 5.40625, + "learning_rate": 4.484242962323056e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8844872564077377, + "num_tokens": 116478884.0, + "step": 1089 + }, + { + "epoch": 2.483466362599772, + "grad_norm": 5.09375, + "learning_rate": 4.483096672799942e-06, + "loss": 0.597, + "mean_token_accuracy": 0.8777136653661728, + "num_tokens": 116585905.0, + "step": 1090 + }, + { + "epoch": 2.485746864310148, + "grad_norm": 4.3125, + "learning_rate": 4.481949257709442e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8820765465497971, + "num_tokens": 116693091.0, + "step": 1091 + }, + { + "epoch": 2.4880273660205243, + "grad_norm": 3.453125, + "learning_rate": 4.480800717702807e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8752633780241013, + "num_tokens": 116801048.0, + "step": 1092 + }, + { + "epoch": 2.490307867730901, + "grad_norm": 2.046875, + "learning_rate": 4.479651053431926e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8796508759260178, + "num_tokens": 116907982.0, + "step": 1093 + }, + { + "epoch": 2.492588369441277, + "grad_norm": 2.875, + "learning_rate": 4.4785002655493246e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8795785456895828, + "num_tokens": 117015153.0, + "step": 1094 + }, + { + "epoch": 2.4948688711516533, + "grad_norm": 2.34375, + "learning_rate": 4.477348354708169e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.8773641586303711, + "num_tokens": 117122420.0, + "step": 1095 + }, + { + "epoch": 2.4971493728620295, + "grad_norm": 2.984375, + "learning_rate": 4.476195321562262e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8779642879962921, + "num_tokens": 117229161.0, + "step": 1096 + }, + { + "epoch": 2.4994298745724057, + "grad_norm": 2.609375, + "learning_rate": 4.475041166766042e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8731478750705719, + "num_tokens": 117336152.0, + "step": 1097 + }, + { + "epoch": 2.5017103762827824, + "grad_norm": 4.875, + "learning_rate": 4.473885890974586e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8745481222867966, + "num_tokens": 117443787.0, + "step": 1098 + }, + { + "epoch": 2.5039908779931586, + "grad_norm": 3.796875, + "learning_rate": 4.472729494843605e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8793596476316452, + "num_tokens": 117550788.0, + "step": 1099 + }, + { + "epoch": 2.5062713797035348, + "grad_norm": 2.65625, + "learning_rate": 4.471571979029448e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8826311677694321, + "num_tokens": 117658107.0, + "step": 1100 + }, + { + "epoch": 2.5062713797035348, + "eval_loss": 0.6002530455589294, + "eval_mean_token_accuracy": 0.8773398306433239, + "eval_num_tokens": 117658107.0, + "eval_runtime": 58.5432, + "eval_samples_per_second": 143.228, + "eval_steps_per_second": 4.492, + "step": 1100 + }, + { + "epoch": 2.508551881413911, + "grad_norm": 2.1875, + "learning_rate": 4.470413344189098e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8801444619894028, + "num_tokens": 117764556.0, + "step": 1101 + }, + { + "epoch": 2.5108323831242876, + "grad_norm": 3.0625, + "learning_rate": 4.469253590980175e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8782180845737457, + "num_tokens": 117871482.0, + "step": 1102 + }, + { + "epoch": 2.5131128848346638, + "grad_norm": 2.46875, + "learning_rate": 4.46809272006093e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8766061216592789, + "num_tokens": 117978366.0, + "step": 1103 + }, + { + "epoch": 2.51539338654504, + "grad_norm": 2.53125, + "learning_rate": 4.466930732090254e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8791538327932358, + "num_tokens": 118085055.0, + "step": 1104 + }, + { + "epoch": 2.517673888255416, + "grad_norm": 2.765625, + "learning_rate": 4.465767627727668e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8798196315765381, + "num_tokens": 118192713.0, + "step": 1105 + }, + { + "epoch": 2.5199543899657924, + "grad_norm": 2.265625, + "learning_rate": 4.464603407633326e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8753893822431564, + "num_tokens": 118299578.0, + "step": 1106 + }, + { + "epoch": 2.5222348916761685, + "grad_norm": 2.4375, + "learning_rate": 4.463438072468018e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.8764231652021408, + "num_tokens": 118406017.0, + "step": 1107 + }, + { + "epoch": 2.524515393386545, + "grad_norm": 2.671875, + "learning_rate": 4.462271622893166e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.8774004876613617, + "num_tokens": 118512926.0, + "step": 1108 + }, + { + "epoch": 2.5267958950969214, + "grad_norm": 2.859375, + "learning_rate": 4.461104059570825e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8694035857915878, + "num_tokens": 118619939.0, + "step": 1109 + }, + { + "epoch": 2.5290763968072976, + "grad_norm": 2.546875, + "learning_rate": 4.4599353831636785e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8803573846817017, + "num_tokens": 118727362.0, + "step": 1110 + }, + { + "epoch": 2.5313568985176738, + "grad_norm": 6.84375, + "learning_rate": 4.458765594335048e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.877105325460434, + "num_tokens": 118834306.0, + "step": 1111 + }, + { + "epoch": 2.5336374002280504, + "grad_norm": 2.421875, + "learning_rate": 4.457594693748881e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8765625059604645, + "num_tokens": 118941239.0, + "step": 1112 + }, + { + "epoch": 2.5359179019384266, + "grad_norm": 2.546875, + "learning_rate": 4.456422682069758e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8844843655824661, + "num_tokens": 119048172.0, + "step": 1113 + }, + { + "epoch": 2.538198403648803, + "grad_norm": 2.9375, + "learning_rate": 4.455249559962892e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.880544051527977, + "num_tokens": 119155079.0, + "step": 1114 + }, + { + "epoch": 2.540478905359179, + "grad_norm": 2.140625, + "learning_rate": 4.454075328094123e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8793693780899048, + "num_tokens": 119262757.0, + "step": 1115 + }, + { + "epoch": 2.542759407069555, + "grad_norm": 2.359375, + "learning_rate": 4.452899987129922e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8787418156862259, + "num_tokens": 119370566.0, + "step": 1116 + }, + { + "epoch": 2.5450399087799314, + "grad_norm": 2.625, + "learning_rate": 4.4517235377373915e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8720010370016098, + "num_tokens": 119477698.0, + "step": 1117 + }, + { + "epoch": 2.547320410490308, + "grad_norm": 2.953125, + "learning_rate": 4.45054598058426e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8797213733196259, + "num_tokens": 119584695.0, + "step": 1118 + }, + { + "epoch": 2.549600912200684, + "grad_norm": 2.40625, + "learning_rate": 4.449367316338887e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8773613125085831, + "num_tokens": 119691823.0, + "step": 1119 + }, + { + "epoch": 2.5518814139110604, + "grad_norm": 2.796875, + "learning_rate": 4.448187545670258e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.881779134273529, + "num_tokens": 119799172.0, + "step": 1120 + }, + { + "epoch": 2.5541619156214366, + "grad_norm": 3.1875, + "learning_rate": 4.44700666924799e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.874766156077385, + "num_tokens": 119905784.0, + "step": 1121 + }, + { + "epoch": 2.556442417331813, + "grad_norm": 3.21875, + "learning_rate": 4.4458246877423254e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.8753187209367752, + "num_tokens": 120012658.0, + "step": 1122 + }, + { + "epoch": 2.5587229190421894, + "grad_norm": 3.296875, + "learning_rate": 4.444641601824134e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8781961053609848, + "num_tokens": 120119347.0, + "step": 1123 + }, + { + "epoch": 2.5610034207525656, + "grad_norm": 2.765625, + "learning_rate": 4.443457412164911e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8751012235879898, + "num_tokens": 120226332.0, + "step": 1124 + }, + { + "epoch": 2.563283922462942, + "grad_norm": 3.03125, + "learning_rate": 4.442272119436781e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8773366808891296, + "num_tokens": 120334032.0, + "step": 1125 + }, + { + "epoch": 2.565564424173318, + "grad_norm": 3.5, + "learning_rate": 4.441085724312494e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8794488459825516, + "num_tokens": 120441226.0, + "step": 1126 + }, + { + "epoch": 2.567844925883694, + "grad_norm": 2.875, + "learning_rate": 4.4398982274654235e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8812543451786041, + "num_tokens": 120549055.0, + "step": 1127 + }, + { + "epoch": 2.570125427594071, + "grad_norm": 4.28125, + "learning_rate": 4.43870962956957e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8838596194982529, + "num_tokens": 120656376.0, + "step": 1128 + }, + { + "epoch": 2.572405929304447, + "grad_norm": 3.125, + "learning_rate": 4.437519931299559e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.878927618265152, + "num_tokens": 120764475.0, + "step": 1129 + }, + { + "epoch": 2.574686431014823, + "grad_norm": 2.609375, + "learning_rate": 4.43632913333064e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8791298568248749, + "num_tokens": 120871237.0, + "step": 1130 + }, + { + "epoch": 2.5769669327251994, + "grad_norm": 2.296875, + "learning_rate": 4.435137236338688e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8821601420640945, + "num_tokens": 120978615.0, + "step": 1131 + }, + { + "epoch": 2.579247434435576, + "grad_norm": 6.46875, + "learning_rate": 4.433944241000199e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.8753758072853088, + "num_tokens": 121085644.0, + "step": 1132 + }, + { + "epoch": 2.581527936145952, + "grad_norm": 3.328125, + "learning_rate": 4.4327501479922955e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8809983879327774, + "num_tokens": 121193058.0, + "step": 1133 + }, + { + "epoch": 2.5838084378563284, + "grad_norm": 2.734375, + "learning_rate": 4.431554957992722e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.8755798637866974, + "num_tokens": 121299745.0, + "step": 1134 + }, + { + "epoch": 2.5860889395667046, + "grad_norm": 4.0625, + "learning_rate": 4.430358671679843e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.8684369623661041, + "num_tokens": 121406377.0, + "step": 1135 + }, + { + "epoch": 2.588369441277081, + "grad_norm": 3.03125, + "learning_rate": 4.42916128973265e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.8750972300767899, + "num_tokens": 121513300.0, + "step": 1136 + }, + { + "epoch": 2.590649942987457, + "grad_norm": 2.9375, + "learning_rate": 4.427962812830753e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.8743036985397339, + "num_tokens": 121620489.0, + "step": 1137 + }, + { + "epoch": 2.5929304446978336, + "grad_norm": 4.25, + "learning_rate": 4.426763241654383e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8752316683530807, + "num_tokens": 121727667.0, + "step": 1138 + }, + { + "epoch": 2.59521094640821, + "grad_norm": 5.15625, + "learning_rate": 4.425562576884396e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8789777606725693, + "num_tokens": 121834628.0, + "step": 1139 + }, + { + "epoch": 2.597491448118586, + "grad_norm": 4.125, + "learning_rate": 4.424360819202264e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8804343789815903, + "num_tokens": 121941613.0, + "step": 1140 + }, + { + "epoch": 2.5997719498289626, + "grad_norm": 2.59375, + "learning_rate": 4.423157969290081e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8748785555362701, + "num_tokens": 122048337.0, + "step": 1141 + }, + { + "epoch": 2.602052451539339, + "grad_norm": 3.328125, + "learning_rate": 4.421954027830565e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8761484026908875, + "num_tokens": 122155743.0, + "step": 1142 + }, + { + "epoch": 2.604332953249715, + "grad_norm": 3.5625, + "learning_rate": 4.4207489955070465e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8797977864742279, + "num_tokens": 122262415.0, + "step": 1143 + }, + { + "epoch": 2.6066134549600912, + "grad_norm": 3.359375, + "learning_rate": 4.419542873003479e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.879091739654541, + "num_tokens": 122369829.0, + "step": 1144 + }, + { + "epoch": 2.6088939566704674, + "grad_norm": 3.921875, + "learning_rate": 4.418335661004436e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.8749701827764511, + "num_tokens": 122476808.0, + "step": 1145 + }, + { + "epoch": 2.6111744583808436, + "grad_norm": 4.65625, + "learning_rate": 4.417127360195107e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8770206123590469, + "num_tokens": 122583941.0, + "step": 1146 + }, + { + "epoch": 2.61345496009122, + "grad_norm": 2.328125, + "learning_rate": 4.415917971261299e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8799891471862793, + "num_tokens": 122691451.0, + "step": 1147 + }, + { + "epoch": 2.6157354618015964, + "grad_norm": 2.671875, + "learning_rate": 4.414707494889439e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8810625970363617, + "num_tokens": 122798569.0, + "step": 1148 + }, + { + "epoch": 2.6180159635119726, + "grad_norm": 2.34375, + "learning_rate": 4.413495931766571e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8791490197181702, + "num_tokens": 122906142.0, + "step": 1149 + }, + { + "epoch": 2.620296465222349, + "grad_norm": 4.84375, + "learning_rate": 4.412283282580352e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.880540743470192, + "num_tokens": 123012808.0, + "step": 1150 + }, + { + "epoch": 2.6225769669327255, + "grad_norm": 2.296875, + "learning_rate": 4.41106954801906e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8770016133785248, + "num_tokens": 123119979.0, + "step": 1151 + }, + { + "epoch": 2.6248574686431017, + "grad_norm": 3.859375, + "learning_rate": 4.409854728771588e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8810955137014389, + "num_tokens": 123227161.0, + "step": 1152 + }, + { + "epoch": 2.627137970353478, + "grad_norm": 4.375, + "learning_rate": 4.4086388255274425e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8776374161243439, + "num_tokens": 123333866.0, + "step": 1153 + }, + { + "epoch": 2.629418472063854, + "grad_norm": 2.171875, + "learning_rate": 4.407421838976747e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8836002796888351, + "num_tokens": 123441095.0, + "step": 1154 + }, + { + "epoch": 2.6316989737742302, + "grad_norm": 2.390625, + "learning_rate": 4.40620376981024e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8822665065526962, + "num_tokens": 123548255.0, + "step": 1155 + }, + { + "epoch": 2.6339794754846064, + "grad_norm": 3.875, + "learning_rate": 4.404984618719275e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8759646117687225, + "num_tokens": 123654882.0, + "step": 1156 + }, + { + "epoch": 2.636259977194983, + "grad_norm": 3.609375, + "learning_rate": 4.403764386395817e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8818103224039078, + "num_tokens": 123762537.0, + "step": 1157 + }, + { + "epoch": 2.6385404789053593, + "grad_norm": 2.28125, + "learning_rate": 4.402543073532446e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8845062106847763, + "num_tokens": 123870045.0, + "step": 1158 + }, + { + "epoch": 2.6408209806157354, + "grad_norm": 2.109375, + "learning_rate": 4.401320680822357e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8820901811122894, + "num_tokens": 123977756.0, + "step": 1159 + }, + { + "epoch": 2.6431014823261116, + "grad_norm": 4.34375, + "learning_rate": 4.400097208959357e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8831377625465393, + "num_tokens": 124085720.0, + "step": 1160 + }, + { + "epoch": 2.6453819840364883, + "grad_norm": 5.375, + "learning_rate": 4.398872658637863e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8773757070302963, + "num_tokens": 124192659.0, + "step": 1161 + }, + { + "epoch": 2.6476624857468645, + "grad_norm": 4.9375, + "learning_rate": 4.397647030552907e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.8716171085834503, + "num_tokens": 124299363.0, + "step": 1162 + }, + { + "epoch": 2.6499429874572407, + "grad_norm": 4.75, + "learning_rate": 4.396420325400132e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8754719495773315, + "num_tokens": 124406432.0, + "step": 1163 + }, + { + "epoch": 2.652223489167617, + "grad_norm": 8.6875, + "learning_rate": 4.3951925438757936e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8721831738948822, + "num_tokens": 124513942.0, + "step": 1164 + }, + { + "epoch": 2.654503990877993, + "grad_norm": 5.6875, + "learning_rate": 4.3939636866767535e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8772239238023758, + "num_tokens": 124621397.0, + "step": 1165 + }, + { + "epoch": 2.6567844925883692, + "grad_norm": 9.8125, + "learning_rate": 4.39273375450049e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.8729386478662491, + "num_tokens": 124728132.0, + "step": 1166 + }, + { + "epoch": 2.659064994298746, + "grad_norm": 5.09375, + "learning_rate": 4.391502748045088e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8794106990098953, + "num_tokens": 124834818.0, + "step": 1167 + }, + { + "epoch": 2.661345496009122, + "grad_norm": 3.859375, + "learning_rate": 4.390270668009244e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8794341683387756, + "num_tokens": 124941733.0, + "step": 1168 + }, + { + "epoch": 2.6636259977194983, + "grad_norm": 2.953125, + "learning_rate": 4.38903751509226e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8788381963968277, + "num_tokens": 125049080.0, + "step": 1169 + }, + { + "epoch": 2.6659064994298745, + "grad_norm": 3.546875, + "learning_rate": 4.3878032899940534e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8787420690059662, + "num_tokens": 125156160.0, + "step": 1170 + }, + { + "epoch": 2.668187001140251, + "grad_norm": 6.1875, + "learning_rate": 4.386567993415144e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8779895603656769, + "num_tokens": 125263507.0, + "step": 1171 + }, + { + "epoch": 2.6704675028506273, + "grad_norm": 5.375, + "learning_rate": 4.3853316260566635e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.8852428048849106, + "num_tokens": 125370490.0, + "step": 1172 + }, + { + "epoch": 2.6727480045610035, + "grad_norm": 5.03125, + "learning_rate": 4.384094188620349e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.8747402876615524, + "num_tokens": 125477079.0, + "step": 1173 + }, + { + "epoch": 2.6750285062713797, + "grad_norm": 3.4375, + "learning_rate": 4.3828556818085485e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8805000483989716, + "num_tokens": 125584049.0, + "step": 1174 + }, + { + "epoch": 2.677309007981756, + "grad_norm": 2.890625, + "learning_rate": 4.3816161063242115e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8765310496091843, + "num_tokens": 125690629.0, + "step": 1175 + }, + { + "epoch": 2.679589509692132, + "grad_norm": 2.625, + "learning_rate": 4.3803754628708995e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8820624649524689, + "num_tokens": 125798232.0, + "step": 1176 + }, + { + "epoch": 2.6818700114025087, + "grad_norm": 3.65625, + "learning_rate": 4.379133752152776e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8863883018493652, + "num_tokens": 125905642.0, + "step": 1177 + }, + { + "epoch": 2.684150513112885, + "grad_norm": 2.25, + "learning_rate": 4.377890974874614e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8794945180416107, + "num_tokens": 126013558.0, + "step": 1178 + }, + { + "epoch": 2.686431014823261, + "grad_norm": 2.125, + "learning_rate": 4.376647131741787e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8828789293766022, + "num_tokens": 126120692.0, + "step": 1179 + }, + { + "epoch": 2.6887115165336373, + "grad_norm": 2.578125, + "learning_rate": 4.375402223460279e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8755092322826385, + "num_tokens": 126227571.0, + "step": 1180 + }, + { + "epoch": 2.690992018244014, + "grad_norm": 5.15625, + "learning_rate": 4.3741562507366754e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8771520704030991, + "num_tokens": 126334666.0, + "step": 1181 + }, + { + "epoch": 2.69327251995439, + "grad_norm": 2.640625, + "learning_rate": 4.3729092142781655e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8833112865686417, + "num_tokens": 126441830.0, + "step": 1182 + }, + { + "epoch": 2.6955530216647663, + "grad_norm": 4.4375, + "learning_rate": 4.3716611147925435e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8818268924951553, + "num_tokens": 126548549.0, + "step": 1183 + }, + { + "epoch": 2.6978335233751425, + "grad_norm": 3.015625, + "learning_rate": 4.370411952988207e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8798163086175919, + "num_tokens": 126655776.0, + "step": 1184 + }, + { + "epoch": 2.7001140250855187, + "grad_norm": 2.484375, + "learning_rate": 4.369161729574155e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8758595883846283, + "num_tokens": 126762585.0, + "step": 1185 + }, + { + "epoch": 2.702394526795895, + "grad_norm": 2.671875, + "learning_rate": 4.367910445259991e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8808387517929077, + "num_tokens": 126869208.0, + "step": 1186 + }, + { + "epoch": 2.7046750285062715, + "grad_norm": 3.703125, + "learning_rate": 4.36665810075592e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.877717912197113, + "num_tokens": 126976031.0, + "step": 1187 + }, + { + "epoch": 2.7069555302166477, + "grad_norm": 3.328125, + "learning_rate": 4.365404696772748e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8818590492010117, + "num_tokens": 127083381.0, + "step": 1188 + }, + { + "epoch": 2.709236031927024, + "grad_norm": 2.0625, + "learning_rate": 4.364150234021883e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8820794522762299, + "num_tokens": 127190792.0, + "step": 1189 + }, + { + "epoch": 2.7115165336374, + "grad_norm": 2.859375, + "learning_rate": 4.362894713215334e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8834023624658585, + "num_tokens": 127298871.0, + "step": 1190 + }, + { + "epoch": 2.7137970353477767, + "grad_norm": 2.359375, + "learning_rate": 4.361638135065711e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8848878443241119, + "num_tokens": 127407231.0, + "step": 1191 + }, + { + "epoch": 2.716077537058153, + "grad_norm": 3.25, + "learning_rate": 4.360380500286222e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8773325383663177, + "num_tokens": 127514509.0, + "step": 1192 + }, + { + "epoch": 2.718358038768529, + "grad_norm": 2.921875, + "learning_rate": 4.359121809590678e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8788514882326126, + "num_tokens": 127621769.0, + "step": 1193 + }, + { + "epoch": 2.7206385404789053, + "grad_norm": 3.390625, + "learning_rate": 4.357862063693486e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.8736522942781448, + "num_tokens": 127728235.0, + "step": 1194 + }, + { + "epoch": 2.7229190421892815, + "grad_norm": 2.640625, + "learning_rate": 4.356601263309654e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8788127601146698, + "num_tokens": 127834915.0, + "step": 1195 + }, + { + "epoch": 2.7251995438996577, + "grad_norm": 2.5, + "learning_rate": 4.355339409154788e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.8762216120958328, + "num_tokens": 127942200.0, + "step": 1196 + }, + { + "epoch": 2.7274800456100343, + "grad_norm": 2.65625, + "learning_rate": 4.354076501945093e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8778090626001358, + "num_tokens": 128049307.0, + "step": 1197 + }, + { + "epoch": 2.7297605473204105, + "grad_norm": 2.53125, + "learning_rate": 4.352812542397369e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8793000727891922, + "num_tokens": 128155836.0, + "step": 1198 + }, + { + "epoch": 2.7320410490307867, + "grad_norm": 2.546875, + "learning_rate": 4.351547531229016e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8827884644269943, + "num_tokens": 128262857.0, + "step": 1199 + }, + { + "epoch": 2.734321550741163, + "grad_norm": 4.21875, + "learning_rate": 4.350281469158029e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8760698437690735, + "num_tokens": 128369643.0, + "step": 1200 + }, + { + "epoch": 2.7366020524515395, + "grad_norm": 2.5625, + "learning_rate": 4.3490143569030025e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8797038048505783, + "num_tokens": 128476571.0, + "step": 1201 + }, + { + "epoch": 2.7388825541619157, + "grad_norm": 4.21875, + "learning_rate": 4.347746195183123e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8761547356843948, + "num_tokens": 128583850.0, + "step": 1202 + }, + { + "epoch": 2.741163055872292, + "grad_norm": 4.28125, + "learning_rate": 4.346476984718176e-06, + "loss": 0.6078, + "mean_token_accuracy": 0.8726416677236557, + "num_tokens": 128691263.0, + "step": 1203 + }, + { + "epoch": 2.743443557582668, + "grad_norm": 5.59375, + "learning_rate": 4.345206726228538e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8812550455331802, + "num_tokens": 128798421.0, + "step": 1204 + }, + { + "epoch": 2.7457240592930443, + "grad_norm": 2.21875, + "learning_rate": 4.343935420435187e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.8780511021614075, + "num_tokens": 128905472.0, + "step": 1205 + }, + { + "epoch": 2.7480045610034205, + "grad_norm": 2.484375, + "learning_rate": 4.34266306805969e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8798847049474716, + "num_tokens": 129012064.0, + "step": 1206 + }, + { + "epoch": 2.750285062713797, + "grad_norm": 3.171875, + "learning_rate": 4.341389669824209e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8787447810173035, + "num_tokens": 129118812.0, + "step": 1207 + }, + { + "epoch": 2.7525655644241733, + "grad_norm": 4.0, + "learning_rate": 4.340115226451501e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8773921728134155, + "num_tokens": 129226022.0, + "step": 1208 + }, + { + "epoch": 2.7548460661345495, + "grad_norm": 5.0625, + "learning_rate": 4.338839738664915e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8822150230407715, + "num_tokens": 129333172.0, + "step": 1209 + }, + { + "epoch": 2.757126567844926, + "grad_norm": 2.515625, + "learning_rate": 4.3375632071883935e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8766026943922043, + "num_tokens": 129440070.0, + "step": 1210 + }, + { + "epoch": 2.7594070695553023, + "grad_norm": 3.453125, + "learning_rate": 4.336285632746472e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8804485648870468, + "num_tokens": 129547114.0, + "step": 1211 + }, + { + "epoch": 2.7616875712656785, + "grad_norm": 2.59375, + "learning_rate": 4.3350070160642754e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8801703006029129, + "num_tokens": 129653799.0, + "step": 1212 + }, + { + "epoch": 2.7639680729760547, + "grad_norm": 3.5625, + "learning_rate": 4.333727357867523e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8849901556968689, + "num_tokens": 129761360.0, + "step": 1213 + }, + { + "epoch": 2.766248574686431, + "grad_norm": 2.5, + "learning_rate": 4.3324466588825235e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8838348835706711, + "num_tokens": 129868322.0, + "step": 1214 + }, + { + "epoch": 2.768529076396807, + "grad_norm": 2.125, + "learning_rate": 4.331164919836177e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8776644319295883, + "num_tokens": 129975643.0, + "step": 1215 + }, + { + "epoch": 2.7708095781071833, + "grad_norm": 2.84375, + "learning_rate": 4.329882141455974e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8794203400611877, + "num_tokens": 130082157.0, + "step": 1216 + }, + { + "epoch": 2.77309007981756, + "grad_norm": 2.859375, + "learning_rate": 4.3285983244699955e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8823015242815018, + "num_tokens": 130189446.0, + "step": 1217 + }, + { + "epoch": 2.775370581527936, + "grad_norm": 2.796875, + "learning_rate": 4.327313469606911e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8795596957206726, + "num_tokens": 130296713.0, + "step": 1218 + }, + { + "epoch": 2.7776510832383123, + "grad_norm": 2.78125, + "learning_rate": 4.326027577595977e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8805474638938904, + "num_tokens": 130403572.0, + "step": 1219 + }, + { + "epoch": 2.779931584948689, + "grad_norm": 2.421875, + "learning_rate": 4.324740649167044e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8781325221061707, + "num_tokens": 130510616.0, + "step": 1220 + }, + { + "epoch": 2.782212086659065, + "grad_norm": 2.0625, + "learning_rate": 4.323452685050545e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8842640519142151, + "num_tokens": 130618088.0, + "step": 1221 + }, + { + "epoch": 2.7844925883694414, + "grad_norm": 2.40625, + "learning_rate": 4.3221636859775075e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8810494989156723, + "num_tokens": 130725134.0, + "step": 1222 + }, + { + "epoch": 2.7867730900798175, + "grad_norm": 4.03125, + "learning_rate": 4.320873652679538e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8809178620576859, + "num_tokens": 130832079.0, + "step": 1223 + }, + { + "epoch": 2.7890535917901937, + "grad_norm": 2.234375, + "learning_rate": 4.319582585888838e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8811828643083572, + "num_tokens": 130939236.0, + "step": 1224 + }, + { + "epoch": 2.79133409350057, + "grad_norm": 2.25, + "learning_rate": 4.31829048633819e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8782161772251129, + "num_tokens": 131046662.0, + "step": 1225 + }, + { + "epoch": 2.7936145952109466, + "grad_norm": 2.625, + "learning_rate": 4.316997354760965e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8795737326145172, + "num_tokens": 131153949.0, + "step": 1226 + }, + { + "epoch": 2.7958950969213228, + "grad_norm": 2.703125, + "learning_rate": 4.3157031918911204e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8794270306825638, + "num_tokens": 131260517.0, + "step": 1227 + }, + { + "epoch": 2.798175598631699, + "grad_norm": 2.5625, + "learning_rate": 4.314407998463198e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.881917729973793, + "num_tokens": 131367523.0, + "step": 1228 + }, + { + "epoch": 2.800456100342075, + "grad_norm": 6.40625, + "learning_rate": 4.3131117752123235e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8779633790254593, + "num_tokens": 131474262.0, + "step": 1229 + }, + { + "epoch": 2.802736602052452, + "grad_norm": 3.5, + "learning_rate": 4.311814522874209e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.8795278668403625, + "num_tokens": 131581107.0, + "step": 1230 + }, + { + "epoch": 2.805017103762828, + "grad_norm": 3.609375, + "learning_rate": 4.3105162421851494e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8744796365499496, + "num_tokens": 131687930.0, + "step": 1231 + }, + { + "epoch": 2.807297605473204, + "grad_norm": 5.8125, + "learning_rate": 4.309216933882025e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8764165937900543, + "num_tokens": 131794765.0, + "step": 1232 + }, + { + "epoch": 2.8095781071835804, + "grad_norm": 2.484375, + "learning_rate": 4.307916598702296e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.883644163608551, + "num_tokens": 131902317.0, + "step": 1233 + }, + { + "epoch": 2.8118586088939566, + "grad_norm": 3.65625, + "learning_rate": 4.3066152373840105e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.8781489878892899, + "num_tokens": 132009851.0, + "step": 1234 + }, + { + "epoch": 2.8141391106043327, + "grad_norm": 2.71875, + "learning_rate": 4.305312850665794e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.8778474628925323, + "num_tokens": 132116939.0, + "step": 1235 + }, + { + "epoch": 2.8164196123147094, + "grad_norm": 4.0, + "learning_rate": 4.304009439286855e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8780102729797363, + "num_tokens": 132223406.0, + "step": 1236 + }, + { + "epoch": 2.8187001140250856, + "grad_norm": 2.953125, + "learning_rate": 4.3027050039869865e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8773478418588638, + "num_tokens": 132330700.0, + "step": 1237 + }, + { + "epoch": 2.8209806157354618, + "grad_norm": 2.65625, + "learning_rate": 4.301399545506561e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8794009536504745, + "num_tokens": 132437921.0, + "step": 1238 + }, + { + "epoch": 2.823261117445838, + "grad_norm": 2.421875, + "learning_rate": 4.3000930645865305e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8809588998556137, + "num_tokens": 132545056.0, + "step": 1239 + }, + { + "epoch": 2.8255416191562146, + "grad_norm": 2.453125, + "learning_rate": 4.298785561968428e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8774296343326569, + "num_tokens": 132651620.0, + "step": 1240 + }, + { + "epoch": 2.827822120866591, + "grad_norm": 2.90625, + "learning_rate": 4.297477038394368e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8836159557104111, + "num_tokens": 132758652.0, + "step": 1241 + }, + { + "epoch": 2.830102622576967, + "grad_norm": 2.53125, + "learning_rate": 4.296167494607043e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8778567016124725, + "num_tokens": 132866236.0, + "step": 1242 + }, + { + "epoch": 2.832383124287343, + "grad_norm": 3.078125, + "learning_rate": 4.294856931349724e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8745770305395126, + "num_tokens": 132973334.0, + "step": 1243 + }, + { + "epoch": 2.8346636259977194, + "grad_norm": 2.515625, + "learning_rate": 4.293545349366262e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8771042227745056, + "num_tokens": 133079721.0, + "step": 1244 + }, + { + "epoch": 2.8369441277080956, + "grad_norm": 2.578125, + "learning_rate": 4.292232749401085e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8794742226600647, + "num_tokens": 133186577.0, + "step": 1245 + }, + { + "epoch": 2.839224629418472, + "grad_norm": 2.703125, + "learning_rate": 4.2909191321992e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8782872408628464, + "num_tokens": 133293871.0, + "step": 1246 + }, + { + "epoch": 2.8415051311288484, + "grad_norm": 2.921875, + "learning_rate": 4.2896044985061915e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8800533413887024, + "num_tokens": 133400863.0, + "step": 1247 + }, + { + "epoch": 2.8437856328392246, + "grad_norm": 2.421875, + "learning_rate": 4.288288849068218e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8809787184000015, + "num_tokens": 133507932.0, + "step": 1248 + }, + { + "epoch": 2.846066134549601, + "grad_norm": 3.25, + "learning_rate": 4.286972184632019e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8738250583410263, + "num_tokens": 133615287.0, + "step": 1249 + }, + { + "epoch": 2.8483466362599774, + "grad_norm": 2.140625, + "learning_rate": 4.285654505944906e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8837039470672607, + "num_tokens": 133722203.0, + "step": 1250 + }, + { + "epoch": 2.8506271379703536, + "grad_norm": 3.78125, + "learning_rate": 4.28433581375477e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8851412385702133, + "num_tokens": 133829983.0, + "step": 1251 + }, + { + "epoch": 2.85290763968073, + "grad_norm": 3.5, + "learning_rate": 4.283016108810073e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8773223906755447, + "num_tokens": 133937207.0, + "step": 1252 + }, + { + "epoch": 2.855188141391106, + "grad_norm": 2.328125, + "learning_rate": 4.281695391859854e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8886352777481079, + "num_tokens": 134044589.0, + "step": 1253 + }, + { + "epoch": 2.857468643101482, + "grad_norm": 5.09375, + "learning_rate": 4.28037366365373e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8755118399858475, + "num_tokens": 134151607.0, + "step": 1254 + }, + { + "epoch": 2.8597491448118584, + "grad_norm": 2.8125, + "learning_rate": 4.279050924941885e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8779601603746414, + "num_tokens": 134258813.0, + "step": 1255 + }, + { + "epoch": 2.862029646522235, + "grad_norm": 3.4375, + "learning_rate": 4.2777271764750805e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8823637515306473, + "num_tokens": 134366131.0, + "step": 1256 + }, + { + "epoch": 2.864310148232611, + "grad_norm": 2.125, + "learning_rate": 4.276402419004652e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8769189864397049, + "num_tokens": 134472934.0, + "step": 1257 + }, + { + "epoch": 2.8665906499429874, + "grad_norm": 2.3125, + "learning_rate": 4.275076653282504e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8746793568134308, + "num_tokens": 134579651.0, + "step": 1258 + }, + { + "epoch": 2.8688711516533636, + "grad_norm": 3.59375, + "learning_rate": 4.273749880061118e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8750852793455124, + "num_tokens": 134686349.0, + "step": 1259 + }, + { + "epoch": 2.8711516533637402, + "grad_norm": 3.484375, + "learning_rate": 4.272422100093542e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.8773278295993805, + "num_tokens": 134793245.0, + "step": 1260 + }, + { + "epoch": 2.8734321550741164, + "grad_norm": 3.28125, + "learning_rate": 4.271093314133401e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8744064420461655, + "num_tokens": 134900346.0, + "step": 1261 + }, + { + "epoch": 2.8757126567844926, + "grad_norm": 2.671875, + "learning_rate": 4.269763522934888e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8789143562316895, + "num_tokens": 135007561.0, + "step": 1262 + }, + { + "epoch": 2.877993158494869, + "grad_norm": 7.625, + "learning_rate": 4.268432727252765e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8737368434667587, + "num_tokens": 135115241.0, + "step": 1263 + }, + { + "epoch": 2.880273660205245, + "grad_norm": 5.4375, + "learning_rate": 4.2671009278423665e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8808005452156067, + "num_tokens": 135221997.0, + "step": 1264 + }, + { + "epoch": 2.882554161915621, + "grad_norm": 3.890625, + "learning_rate": 4.265768125459597e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.879405215382576, + "num_tokens": 135329159.0, + "step": 1265 + }, + { + "epoch": 2.884834663625998, + "grad_norm": 2.53125, + "learning_rate": 4.264434320860929e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8776859790086746, + "num_tokens": 135436122.0, + "step": 1266 + }, + { + "epoch": 2.887115165336374, + "grad_norm": 2.921875, + "learning_rate": 4.2630995148034044e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.8750355541706085, + "num_tokens": 135542672.0, + "step": 1267 + }, + { + "epoch": 2.88939566704675, + "grad_norm": 2.265625, + "learning_rate": 4.261763708044633e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.8799539804458618, + "num_tokens": 135649649.0, + "step": 1268 + }, + { + "epoch": 2.8916761687571264, + "grad_norm": 2.953125, + "learning_rate": 4.2604269013427925e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8781967163085938, + "num_tokens": 135756953.0, + "step": 1269 + }, + { + "epoch": 2.893956670467503, + "grad_norm": 3.03125, + "learning_rate": 4.25908909545663e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.8765884786844254, + "num_tokens": 135863953.0, + "step": 1270 + }, + { + "epoch": 2.8962371721778792, + "grad_norm": 3.890625, + "learning_rate": 4.257750291145457e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8713682293891907, + "num_tokens": 135970439.0, + "step": 1271 + }, + { + "epoch": 2.8985176738882554, + "grad_norm": 2.125, + "learning_rate": 4.256410489169154e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8807791769504547, + "num_tokens": 136077645.0, + "step": 1272 + }, + { + "epoch": 2.9007981755986316, + "grad_norm": 5.125, + "learning_rate": 4.255069690288166e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8755863457918167, + "num_tokens": 136184852.0, + "step": 1273 + }, + { + "epoch": 2.903078677309008, + "grad_norm": 2.265625, + "learning_rate": 4.253727895263504e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8805266171693802, + "num_tokens": 136292297.0, + "step": 1274 + }, + { + "epoch": 2.905359179019384, + "grad_norm": 2.4375, + "learning_rate": 4.252385104856746e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8784385919570923, + "num_tokens": 136399187.0, + "step": 1275 + }, + { + "epoch": 2.9076396807297606, + "grad_norm": 2.78125, + "learning_rate": 4.251041319830034e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8847327828407288, + "num_tokens": 136506050.0, + "step": 1276 + }, + { + "epoch": 2.909920182440137, + "grad_norm": 2.640625, + "learning_rate": 4.249696540946074e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8823814243078232, + "num_tokens": 136613948.0, + "step": 1277 + }, + { + "epoch": 2.912200684150513, + "grad_norm": 2.40625, + "learning_rate": 4.248350768968136e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8782215118408203, + "num_tokens": 136720344.0, + "step": 1278 + }, + { + "epoch": 2.9144811858608897, + "grad_norm": 2.6875, + "learning_rate": 4.247004004660055e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8780831098556519, + "num_tokens": 136827167.0, + "step": 1279 + }, + { + "epoch": 2.916761687571266, + "grad_norm": 2.359375, + "learning_rate": 4.245656248786228e-06, + "loss": 0.573, + "mean_token_accuracy": 0.880301833152771, + "num_tokens": 136933946.0, + "step": 1280 + }, + { + "epoch": 2.919042189281642, + "grad_norm": 2.9375, + "learning_rate": 4.2443075021116166e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8767362087965012, + "num_tokens": 137040549.0, + "step": 1281 + }, + { + "epoch": 2.9213226909920182, + "grad_norm": 2.65625, + "learning_rate": 4.242957765401741e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8834853321313858, + "num_tokens": 137147698.0, + "step": 1282 + }, + { + "epoch": 2.9236031927023944, + "grad_norm": 2.765625, + "learning_rate": 4.241607039422687e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8839240074157715, + "num_tokens": 137254610.0, + "step": 1283 + }, + { + "epoch": 2.9258836944127706, + "grad_norm": 2.484375, + "learning_rate": 4.2402553249411e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8824329972267151, + "num_tokens": 137361723.0, + "step": 1284 + }, + { + "epoch": 2.928164196123147, + "grad_norm": 3.234375, + "learning_rate": 4.238902622724188e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8801659643650055, + "num_tokens": 137469233.0, + "step": 1285 + }, + { + "epoch": 2.9304446978335235, + "grad_norm": 3.171875, + "learning_rate": 4.237548933539718e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8831808865070343, + "num_tokens": 137575939.0, + "step": 1286 + }, + { + "epoch": 2.9327251995438997, + "grad_norm": 2.65625, + "learning_rate": 4.236194258156019e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8819428384304047, + "num_tokens": 137683411.0, + "step": 1287 + }, + { + "epoch": 2.935005701254276, + "grad_norm": 2.703125, + "learning_rate": 4.234838597341977e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8816139101982117, + "num_tokens": 137790631.0, + "step": 1288 + }, + { + "epoch": 2.9372862029646525, + "grad_norm": 3.421875, + "learning_rate": 4.233481951867039e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.8764592856168747, + "num_tokens": 137897306.0, + "step": 1289 + }, + { + "epoch": 2.9395667046750287, + "grad_norm": 2.375, + "learning_rate": 4.232124322501212e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8771606087684631, + "num_tokens": 138004247.0, + "step": 1290 + }, + { + "epoch": 2.941847206385405, + "grad_norm": 2.53125, + "learning_rate": 4.230765710015058e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8792049288749695, + "num_tokens": 138110738.0, + "step": 1291 + }, + { + "epoch": 2.944127708095781, + "grad_norm": 2.359375, + "learning_rate": 4.229406115179703e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8832272589206696, + "num_tokens": 138218085.0, + "step": 1292 + }, + { + "epoch": 2.9464082098061573, + "grad_norm": 4.34375, + "learning_rate": 4.228045538766823e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.8802038431167603, + "num_tokens": 138324630.0, + "step": 1293 + }, + { + "epoch": 2.9486887115165334, + "grad_norm": 3.734375, + "learning_rate": 4.226683981548656e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.8761596828699112, + "num_tokens": 138431811.0, + "step": 1294 + }, + { + "epoch": 2.95096921322691, + "grad_norm": 3.8125, + "learning_rate": 4.2253214442979975e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8778193593025208, + "num_tokens": 138539059.0, + "step": 1295 + }, + { + "epoch": 2.9532497149372863, + "grad_norm": 6.34375, + "learning_rate": 4.223957927788195e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.8704663217067719, + "num_tokens": 138646166.0, + "step": 1296 + }, + { + "epoch": 2.9555302166476625, + "grad_norm": 7.0625, + "learning_rate": 4.222593432793155e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.879715234041214, + "num_tokens": 138753154.0, + "step": 1297 + }, + { + "epoch": 2.9578107183580387, + "grad_norm": 3.421875, + "learning_rate": 4.2212279600873385e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8822585344314575, + "num_tokens": 138860601.0, + "step": 1298 + }, + { + "epoch": 2.9600912200684153, + "grad_norm": 6.03125, + "learning_rate": 4.219861510445762e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8831232637166977, + "num_tokens": 138967531.0, + "step": 1299 + }, + { + "epoch": 2.9623717217787915, + "grad_norm": 3.78125, + "learning_rate": 4.2184940846439946e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8782211989164352, + "num_tokens": 139074913.0, + "step": 1300 + }, + { + "epoch": 2.9646522234891677, + "grad_norm": 9.3125, + "learning_rate": 4.217125683458162e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8801289498806, + "num_tokens": 139182021.0, + "step": 1301 + }, + { + "epoch": 2.966932725199544, + "grad_norm": 11.3125, + "learning_rate": 4.215756307664941e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8780060112476349, + "num_tokens": 139289235.0, + "step": 1302 + }, + { + "epoch": 2.96921322690992, + "grad_norm": 13.75, + "learning_rate": 4.214385958041565e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.8715900182723999, + "num_tokens": 139396474.0, + "step": 1303 + }, + { + "epoch": 2.9714937286202963, + "grad_norm": 9.8125, + "learning_rate": 4.213014635365816e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8730522990226746, + "num_tokens": 139502964.0, + "step": 1304 + }, + { + "epoch": 2.973774230330673, + "grad_norm": 7.125, + "learning_rate": 4.2116423404160316e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8835666477680206, + "num_tokens": 139610473.0, + "step": 1305 + }, + { + "epoch": 2.976054732041049, + "grad_norm": 2.71875, + "learning_rate": 4.210269073971098e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8774100840091705, + "num_tokens": 139718229.0, + "step": 1306 + }, + { + "epoch": 2.9783352337514253, + "grad_norm": 4.0, + "learning_rate": 4.208894836810457e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8828433156013489, + "num_tokens": 139825370.0, + "step": 1307 + }, + { + "epoch": 2.9806157354618015, + "grad_norm": 3.90625, + "learning_rate": 4.207519629714099e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8787718713283539, + "num_tokens": 139932155.0, + "step": 1308 + }, + { + "epoch": 2.982896237172178, + "grad_norm": 6.28125, + "learning_rate": 4.206143453462562e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.8779617547988892, + "num_tokens": 140039280.0, + "step": 1309 + }, + { + "epoch": 2.9851767388825543, + "grad_norm": 2.78125, + "learning_rate": 4.204766308836941e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8793751299381256, + "num_tokens": 140146757.0, + "step": 1310 + }, + { + "epoch": 2.9874572405929305, + "grad_norm": 2.71875, + "learning_rate": 4.203388196618874e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8730627149343491, + "num_tokens": 140253574.0, + "step": 1311 + }, + { + "epoch": 2.9897377423033067, + "grad_norm": 5.25, + "learning_rate": 4.202009117590552e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.8779786825180054, + "num_tokens": 140360312.0, + "step": 1312 + }, + { + "epoch": 2.992018244013683, + "grad_norm": 5.1875, + "learning_rate": 4.200629072534713e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8807083070278168, + "num_tokens": 140467414.0, + "step": 1313 + }, + { + "epoch": 2.994298745724059, + "grad_norm": 6.75, + "learning_rate": 4.1992480622346455e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8760360926389694, + "num_tokens": 140574002.0, + "step": 1314 + }, + { + "epoch": 2.9965792474344357, + "grad_norm": 3.6875, + "learning_rate": 4.197866087474181e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8772165775299072, + "num_tokens": 140680798.0, + "step": 1315 + }, + { + "epoch": 2.998859749144812, + "grad_norm": 2.9375, + "learning_rate": 4.196483149037707e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8783533871173859, + "num_tokens": 140787588.0, + "step": 1316 + }, + { + "epoch": 3.0, + "grad_norm": 7.1875, + "learning_rate": 4.195099247710147e-06, + "loss": 0.624, + "mean_token_accuracy": 0.8783180117607117, + "num_tokens": 140826696.0, + "step": 1317 + }, + { + "epoch": 3.002280501710376, + "grad_norm": 5.28125, + "learning_rate": 4.1937143842769805e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8779798746109009, + "num_tokens": 140933415.0, + "step": 1318 + }, + { + "epoch": 3.0045610034207524, + "grad_norm": 3.359375, + "learning_rate": 4.192328559524227e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8831545263528824, + "num_tokens": 141041011.0, + "step": 1319 + }, + { + "epoch": 3.006841505131129, + "grad_norm": 3.53125, + "learning_rate": 4.190941774238454e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8806227445602417, + "num_tokens": 141148541.0, + "step": 1320 + }, + { + "epoch": 3.006841505131129, + "eval_loss": 0.596246063709259, + "eval_mean_token_accuracy": 0.8780989914339304, + "eval_num_tokens": 141148541.0, + "eval_runtime": 58.6547, + "eval_samples_per_second": 142.955, + "eval_steps_per_second": 4.484, + "step": 1320 + }, + { + "epoch": 3.009122006841505, + "grad_norm": 2.484375, + "learning_rate": 4.1895540292067765e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8786788433790207, + "num_tokens": 141255428.0, + "step": 1321 + }, + { + "epoch": 3.0114025085518814, + "grad_norm": 5.5625, + "learning_rate": 4.18816532521685e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8818920701742172, + "num_tokens": 141362931.0, + "step": 1322 + }, + { + "epoch": 3.0136830102622576, + "grad_norm": 8.25, + "learning_rate": 4.1867756630568755e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8816585689783096, + "num_tokens": 141470783.0, + "step": 1323 + }, + { + "epoch": 3.015963511972634, + "grad_norm": 5.65625, + "learning_rate": 4.1853850435156e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8797392249107361, + "num_tokens": 141578103.0, + "step": 1324 + }, + { + "epoch": 3.0182440136830104, + "grad_norm": 4.125, + "learning_rate": 4.18399346738231e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8776365965604782, + "num_tokens": 141684405.0, + "step": 1325 + }, + { + "epoch": 3.0205245153933866, + "grad_norm": 2.671875, + "learning_rate": 4.18260093544684e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8821319788694382, + "num_tokens": 141790905.0, + "step": 1326 + }, + { + "epoch": 3.022805017103763, + "grad_norm": 5.0625, + "learning_rate": 4.181207448499562e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8799286335706711, + "num_tokens": 141898196.0, + "step": 1327 + }, + { + "epoch": 3.025085518814139, + "grad_norm": 2.953125, + "learning_rate": 4.179813007331394e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8794847130775452, + "num_tokens": 142005139.0, + "step": 1328 + }, + { + "epoch": 3.027366020524515, + "grad_norm": 3.890625, + "learning_rate": 4.178417612733792e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8797455281019211, + "num_tokens": 142111855.0, + "step": 1329 + }, + { + "epoch": 3.029646522234892, + "grad_norm": 6.53125, + "learning_rate": 4.177021265498757e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.879165843129158, + "num_tokens": 142218523.0, + "step": 1330 + }, + { + "epoch": 3.031927023945268, + "grad_norm": 3.34375, + "learning_rate": 4.1756239664188275e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8838147222995758, + "num_tokens": 142325909.0, + "step": 1331 + }, + { + "epoch": 3.034207525655644, + "grad_norm": 2.21875, + "learning_rate": 4.1742257162870835e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8792033344507217, + "num_tokens": 142433088.0, + "step": 1332 + }, + { + "epoch": 3.0364880273660204, + "grad_norm": 2.78125, + "learning_rate": 4.172826515897146e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.8756905943155289, + "num_tokens": 142540035.0, + "step": 1333 + }, + { + "epoch": 3.0387685290763966, + "grad_norm": 4.59375, + "learning_rate": 4.171426366043172e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8791181296110153, + "num_tokens": 142647392.0, + "step": 1334 + }, + { + "epoch": 3.0410490307867732, + "grad_norm": 3.5625, + "learning_rate": 4.170025267519862e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8790037631988525, + "num_tokens": 142754270.0, + "step": 1335 + }, + { + "epoch": 3.0433295324971494, + "grad_norm": 5.28125, + "learning_rate": 4.168623221122451e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8844562470912933, + "num_tokens": 142861663.0, + "step": 1336 + }, + { + "epoch": 3.0456100342075256, + "grad_norm": 2.796875, + "learning_rate": 4.167220227646713e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8789408653974533, + "num_tokens": 142968943.0, + "step": 1337 + }, + { + "epoch": 3.047890535917902, + "grad_norm": 4.0625, + "learning_rate": 4.165816287888962e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.879507377743721, + "num_tokens": 143076504.0, + "step": 1338 + }, + { + "epoch": 3.050171037628278, + "grad_norm": 2.5, + "learning_rate": 4.164411402646045e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8781361430883408, + "num_tokens": 143183423.0, + "step": 1339 + }, + { + "epoch": 3.0524515393386547, + "grad_norm": 5.46875, + "learning_rate": 4.163005572715348e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.879531055688858, + "num_tokens": 143290366.0, + "step": 1340 + }, + { + "epoch": 3.054732041049031, + "grad_norm": 3.796875, + "learning_rate": 4.161598798894795e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8771509379148483, + "num_tokens": 143397090.0, + "step": 1341 + }, + { + "epoch": 3.057012542759407, + "grad_norm": 2.28125, + "learning_rate": 4.160191081982841e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.8763537108898163, + "num_tokens": 143504171.0, + "step": 1342 + }, + { + "epoch": 3.0592930444697832, + "grad_norm": 2.734375, + "learning_rate": 4.15878242277848e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8811188638210297, + "num_tokens": 143612068.0, + "step": 1343 + }, + { + "epoch": 3.0615735461801594, + "grad_norm": 2.5625, + "learning_rate": 4.157372822081241e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.8822892308235168, + "num_tokens": 143719411.0, + "step": 1344 + }, + { + "epoch": 3.063854047890536, + "grad_norm": 3.09375, + "learning_rate": 4.155962280691184e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8773934096097946, + "num_tokens": 143826292.0, + "step": 1345 + }, + { + "epoch": 3.0661345496009123, + "grad_norm": 6.125, + "learning_rate": 4.154550799408906e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8825222551822662, + "num_tokens": 143933142.0, + "step": 1346 + }, + { + "epoch": 3.0684150513112884, + "grad_norm": 4.125, + "learning_rate": 4.153138379035537e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8797077685594559, + "num_tokens": 144040202.0, + "step": 1347 + }, + { + "epoch": 3.0706955530216646, + "grad_norm": 4.25, + "learning_rate": 4.1517250203727395e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8776738494634628, + "num_tokens": 144147431.0, + "step": 1348 + }, + { + "epoch": 3.072976054732041, + "grad_norm": 3.734375, + "learning_rate": 4.150310724222708e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8820817023515701, + "num_tokens": 144254552.0, + "step": 1349 + }, + { + "epoch": 3.0752565564424175, + "grad_norm": 3.5625, + "learning_rate": 4.14889549138817e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8843452036380768, + "num_tokens": 144362233.0, + "step": 1350 + }, + { + "epoch": 3.0775370581527937, + "grad_norm": 2.65625, + "learning_rate": 4.147479322672383e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8773269057273865, + "num_tokens": 144469328.0, + "step": 1351 + }, + { + "epoch": 3.07981755986317, + "grad_norm": 2.484375, + "learning_rate": 4.14606221887914e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.882498249411583, + "num_tokens": 144576504.0, + "step": 1352 + }, + { + "epoch": 3.082098061573546, + "grad_norm": 4.0625, + "learning_rate": 4.144644180812759e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8806255161762238, + "num_tokens": 144683928.0, + "step": 1353 + }, + { + "epoch": 3.0843785632839227, + "grad_norm": 2.875, + "learning_rate": 4.143225209278093e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8764054775238037, + "num_tokens": 144791032.0, + "step": 1354 + }, + { + "epoch": 3.086659064994299, + "grad_norm": 3.5625, + "learning_rate": 4.141805305080521e-06, + "loss": 0.6094, + "mean_token_accuracy": 0.8736419975757599, + "num_tokens": 144897700.0, + "step": 1355 + }, + { + "epoch": 3.088939566704675, + "grad_norm": 3.703125, + "learning_rate": 4.1403844690259544e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8770473003387451, + "num_tokens": 145004603.0, + "step": 1356 + }, + { + "epoch": 3.0912200684150513, + "grad_norm": 2.03125, + "learning_rate": 4.138962701920831e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8825189918279648, + "num_tokens": 145112050.0, + "step": 1357 + }, + { + "epoch": 3.0935005701254275, + "grad_norm": 2.609375, + "learning_rate": 4.13754000457212e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8764393627643585, + "num_tokens": 145218872.0, + "step": 1358 + }, + { + "epoch": 3.095781071835804, + "grad_norm": 2.8125, + "learning_rate": 4.136116377787317e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8764399290084839, + "num_tokens": 145325751.0, + "step": 1359 + }, + { + "epoch": 3.0980615735461803, + "grad_norm": 2.5, + "learning_rate": 4.134691822374445e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8835193067789078, + "num_tokens": 145432654.0, + "step": 1360 + }, + { + "epoch": 3.1003420752565565, + "grad_norm": 3.0, + "learning_rate": 4.1332663391420515e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8790077865123749, + "num_tokens": 145539487.0, + "step": 1361 + }, + { + "epoch": 3.1026225769669327, + "grad_norm": 2.328125, + "learning_rate": 4.131839928899217e-06, + "loss": 0.58, + "mean_token_accuracy": 0.880421444773674, + "num_tokens": 145647251.0, + "step": 1362 + }, + { + "epoch": 3.104903078677309, + "grad_norm": 2.4375, + "learning_rate": 4.130412592455542e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8812432438135147, + "num_tokens": 145754411.0, + "step": 1363 + }, + { + "epoch": 3.1071835803876855, + "grad_norm": 3.15625, + "learning_rate": 4.128984330621157e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8790383189916611, + "num_tokens": 145861509.0, + "step": 1364 + }, + { + "epoch": 3.1094640820980617, + "grad_norm": 3.453125, + "learning_rate": 4.127555144206713e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8779343664646149, + "num_tokens": 145968759.0, + "step": 1365 + }, + { + "epoch": 3.111744583808438, + "grad_norm": 2.46875, + "learning_rate": 4.126125034023392e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8722660690546036, + "num_tokens": 146075699.0, + "step": 1366 + }, + { + "epoch": 3.114025085518814, + "grad_norm": 4.03125, + "learning_rate": 4.124694000882894e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8754329532384872, + "num_tokens": 146183526.0, + "step": 1367 + }, + { + "epoch": 3.1163055872291903, + "grad_norm": 2.359375, + "learning_rate": 4.123262045597447e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8771016895771027, + "num_tokens": 146290729.0, + "step": 1368 + }, + { + "epoch": 3.118586088939567, + "grad_norm": 3.71875, + "learning_rate": 4.121829168979802e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8831167221069336, + "num_tokens": 146397777.0, + "step": 1369 + }, + { + "epoch": 3.120866590649943, + "grad_norm": 6.4375, + "learning_rate": 4.120395371843231e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.8783126771450043, + "num_tokens": 146504599.0, + "step": 1370 + }, + { + "epoch": 3.1231470923603193, + "grad_norm": 3.578125, + "learning_rate": 4.11896065500153e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8771827965974808, + "num_tokens": 146611271.0, + "step": 1371 + }, + { + "epoch": 3.1254275940706955, + "grad_norm": 3.171875, + "learning_rate": 4.117525019269016e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8794733434915543, + "num_tokens": 146718261.0, + "step": 1372 + }, + { + "epoch": 3.1277080957810717, + "grad_norm": 2.65625, + "learning_rate": 4.116088465460529e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8812500983476639, + "num_tokens": 146825123.0, + "step": 1373 + }, + { + "epoch": 3.1299885974914483, + "grad_norm": 3.25, + "learning_rate": 4.114650994391428e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8858306407928467, + "num_tokens": 146932562.0, + "step": 1374 + }, + { + "epoch": 3.1322690992018245, + "grad_norm": 4.84375, + "learning_rate": 4.113212606877596e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8762167394161224, + "num_tokens": 147039235.0, + "step": 1375 + }, + { + "epoch": 3.1345496009122007, + "grad_norm": 2.5625, + "learning_rate": 4.111773303735432e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8757900893688202, + "num_tokens": 147146449.0, + "step": 1376 + }, + { + "epoch": 3.136830102622577, + "grad_norm": 3.65625, + "learning_rate": 4.110333085781857e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8781153112649918, + "num_tokens": 147253224.0, + "step": 1377 + }, + { + "epoch": 3.139110604332953, + "grad_norm": 2.921875, + "learning_rate": 4.108891953834312e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8766528069972992, + "num_tokens": 147359809.0, + "step": 1378 + }, + { + "epoch": 3.1413911060433297, + "grad_norm": 3.671875, + "learning_rate": 4.107449908710753e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8773281574249268, + "num_tokens": 147467375.0, + "step": 1379 + }, + { + "epoch": 3.143671607753706, + "grad_norm": 4.40625, + "learning_rate": 4.106006951229661e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8796637058258057, + "num_tokens": 147574319.0, + "step": 1380 + }, + { + "epoch": 3.145952109464082, + "grad_norm": 3.8125, + "learning_rate": 4.104563082210028e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8800371438264847, + "num_tokens": 147681603.0, + "step": 1381 + }, + { + "epoch": 3.1482326111744583, + "grad_norm": 3.734375, + "learning_rate": 4.1031183024713665e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8773266673088074, + "num_tokens": 147788610.0, + "step": 1382 + }, + { + "epoch": 3.1505131128848345, + "grad_norm": 4.5, + "learning_rate": 4.101672612833706e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8835958689451218, + "num_tokens": 147895321.0, + "step": 1383 + }, + { + "epoch": 3.152793614595211, + "grad_norm": 4.75, + "learning_rate": 4.100226014117592e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8747086226940155, + "num_tokens": 148002099.0, + "step": 1384 + }, + { + "epoch": 3.1550741163055873, + "grad_norm": 4.0, + "learning_rate": 4.098778507144086e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.883410856127739, + "num_tokens": 148109199.0, + "step": 1385 + }, + { + "epoch": 3.1573546180159635, + "grad_norm": 3.6875, + "learning_rate": 4.097330092734765e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8802912831306458, + "num_tokens": 148215632.0, + "step": 1386 + }, + { + "epoch": 3.1596351197263397, + "grad_norm": 6.375, + "learning_rate": 4.09588077171172e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8851134181022644, + "num_tokens": 148323133.0, + "step": 1387 + }, + { + "epoch": 3.161915621436716, + "grad_norm": 6.21875, + "learning_rate": 4.094430544897559e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8810424953699112, + "num_tokens": 148429831.0, + "step": 1388 + }, + { + "epoch": 3.1641961231470925, + "grad_norm": 6.0625, + "learning_rate": 4.092979413115404e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8824973404407501, + "num_tokens": 148536848.0, + "step": 1389 + }, + { + "epoch": 3.1664766248574687, + "grad_norm": 4.21875, + "learning_rate": 4.091527377188886e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8795595914125443, + "num_tokens": 148644319.0, + "step": 1390 + }, + { + "epoch": 3.168757126567845, + "grad_norm": 3.640625, + "learning_rate": 4.090074437942155e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.879493236541748, + "num_tokens": 148751953.0, + "step": 1391 + }, + { + "epoch": 3.171037628278221, + "grad_norm": 5.0625, + "learning_rate": 4.088620596199872e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8819226622581482, + "num_tokens": 148859244.0, + "step": 1392 + }, + { + "epoch": 3.1733181299885973, + "grad_norm": 5.15625, + "learning_rate": 4.087165852787206e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8783839493989944, + "num_tokens": 148966518.0, + "step": 1393 + }, + { + "epoch": 3.175598631698974, + "grad_norm": 6.09375, + "learning_rate": 4.085710208529844e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8783328980207443, + "num_tokens": 149074306.0, + "step": 1394 + }, + { + "epoch": 3.17787913340935, + "grad_norm": 2.671875, + "learning_rate": 4.084253664253981e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8816855847835541, + "num_tokens": 149181408.0, + "step": 1395 + }, + { + "epoch": 3.1801596351197263, + "grad_norm": 4.6875, + "learning_rate": 4.082796220786324e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8770945519208908, + "num_tokens": 149288874.0, + "step": 1396 + }, + { + "epoch": 3.1824401368301025, + "grad_norm": 4.09375, + "learning_rate": 4.081337878954088e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8788999617099762, + "num_tokens": 149395972.0, + "step": 1397 + }, + { + "epoch": 3.1847206385404787, + "grad_norm": 2.25, + "learning_rate": 4.079878639585002e-06, + "loss": 0.563, + "mean_token_accuracy": 0.884684830904007, + "num_tokens": 149503225.0, + "step": 1398 + }, + { + "epoch": 3.1870011402508553, + "grad_norm": 3.234375, + "learning_rate": 4.0784185035072996e-06, + "loss": 0.593, + "mean_token_accuracy": 0.8796551078557968, + "num_tokens": 149609665.0, + "step": 1399 + }, + { + "epoch": 3.1892816419612315, + "grad_norm": 2.59375, + "learning_rate": 4.076957471549728e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.8772178441286087, + "num_tokens": 149716324.0, + "step": 1400 + }, + { + "epoch": 3.1915621436716077, + "grad_norm": 3.4375, + "learning_rate": 4.0754955445415405e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.8765067309141159, + "num_tokens": 149823176.0, + "step": 1401 + }, + { + "epoch": 3.193842645381984, + "grad_norm": 3.296875, + "learning_rate": 4.074032723312497e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8809285014867783, + "num_tokens": 149930057.0, + "step": 1402 + }, + { + "epoch": 3.19612314709236, + "grad_norm": 3.484375, + "learning_rate": 4.072569008692868e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8809255510568619, + "num_tokens": 150036837.0, + "step": 1403 + }, + { + "epoch": 3.1984036488027368, + "grad_norm": 2.53125, + "learning_rate": 4.071104401513429e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8793366551399231, + "num_tokens": 150144003.0, + "step": 1404 + }, + { + "epoch": 3.200684150513113, + "grad_norm": 3.0625, + "learning_rate": 4.069638902605464e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8811514675617218, + "num_tokens": 150252100.0, + "step": 1405 + }, + { + "epoch": 3.202964652223489, + "grad_norm": 2.953125, + "learning_rate": 4.06817251280076e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8807378858327866, + "num_tokens": 150359077.0, + "step": 1406 + }, + { + "epoch": 3.2052451539338653, + "grad_norm": 2.78125, + "learning_rate": 4.0667052329316125e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8794204890727997, + "num_tokens": 150466473.0, + "step": 1407 + }, + { + "epoch": 3.2075256556442415, + "grad_norm": 2.46875, + "learning_rate": 4.0652370638308215e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8787625581026077, + "num_tokens": 150572929.0, + "step": 1408 + }, + { + "epoch": 3.209806157354618, + "grad_norm": 3.6875, + "learning_rate": 4.063768006331691e-06, + "loss": 0.598, + "mean_token_accuracy": 0.874964028596878, + "num_tokens": 150679963.0, + "step": 1409 + }, + { + "epoch": 3.2120866590649944, + "grad_norm": 6.4375, + "learning_rate": 4.06229806126803e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.882831260561943, + "num_tokens": 150787240.0, + "step": 1410 + }, + { + "epoch": 3.2143671607753705, + "grad_norm": 2.921875, + "learning_rate": 4.06082722947415e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8842821419239044, + "num_tokens": 150894724.0, + "step": 1411 + }, + { + "epoch": 3.2166476624857467, + "grad_norm": 6.09375, + "learning_rate": 4.059355511784868e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8783928453922272, + "num_tokens": 151001455.0, + "step": 1412 + }, + { + "epoch": 3.2189281641961234, + "grad_norm": 5.0625, + "learning_rate": 4.057882909035503e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8786003589630127, + "num_tokens": 151108702.0, + "step": 1413 + }, + { + "epoch": 3.2212086659064996, + "grad_norm": 4.75, + "learning_rate": 4.0564094220618735e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8806807845830917, + "num_tokens": 151215645.0, + "step": 1414 + }, + { + "epoch": 3.2234891676168758, + "grad_norm": 3.53125, + "learning_rate": 4.054935051700305e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8828134685754776, + "num_tokens": 151322626.0, + "step": 1415 + }, + { + "epoch": 3.225769669327252, + "grad_norm": 2.546875, + "learning_rate": 4.053459798787619e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8766085356473923, + "num_tokens": 151429877.0, + "step": 1416 + }, + { + "epoch": 3.228050171037628, + "grad_norm": 4.34375, + "learning_rate": 4.0519836641611425e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.8795545697212219, + "num_tokens": 151536994.0, + "step": 1417 + }, + { + "epoch": 3.2303306727480043, + "grad_norm": 5.6875, + "learning_rate": 4.050506648658701e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.8756987452507019, + "num_tokens": 151643935.0, + "step": 1418 + }, + { + "epoch": 3.232611174458381, + "grad_norm": 4.03125, + "learning_rate": 4.049028753118619e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8784631341695786, + "num_tokens": 151751200.0, + "step": 1419 + }, + { + "epoch": 3.234891676168757, + "grad_norm": 3.84375, + "learning_rate": 4.047549978379721e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.8733428716659546, + "num_tokens": 151858059.0, + "step": 1420 + }, + { + "epoch": 3.2371721778791334, + "grad_norm": 2.421875, + "learning_rate": 4.046070325281333e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8788390904664993, + "num_tokens": 151965037.0, + "step": 1421 + }, + { + "epoch": 3.2394526795895096, + "grad_norm": 4.6875, + "learning_rate": 4.044589794663275e-06, + "loss": 0.6012, + "mean_token_accuracy": 0.8751859962940216, + "num_tokens": 152072452.0, + "step": 1422 + }, + { + "epoch": 3.241733181299886, + "grad_norm": 3.140625, + "learning_rate": 4.04310838736587e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8823445439338684, + "num_tokens": 152180256.0, + "step": 1423 + }, + { + "epoch": 3.2440136830102624, + "grad_norm": 5.3125, + "learning_rate": 4.041626104229937e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8832966238260269, + "num_tokens": 152287148.0, + "step": 1424 + }, + { + "epoch": 3.2462941847206386, + "grad_norm": 2.4375, + "learning_rate": 4.0401429460967864e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8766773343086243, + "num_tokens": 152393325.0, + "step": 1425 + }, + { + "epoch": 3.2485746864310148, + "grad_norm": 2.609375, + "learning_rate": 4.038658913808235e-06, + "loss": 0.595, + "mean_token_accuracy": 0.877457782626152, + "num_tokens": 152500677.0, + "step": 1426 + }, + { + "epoch": 3.250855188141391, + "grad_norm": 3.671875, + "learning_rate": 4.037174008206589e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8779526948928833, + "num_tokens": 152606926.0, + "step": 1427 + }, + { + "epoch": 3.253135689851767, + "grad_norm": 2.875, + "learning_rate": 4.035688230134651e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8811050057411194, + "num_tokens": 152713975.0, + "step": 1428 + }, + { + "epoch": 3.255416191562144, + "grad_norm": 3.046875, + "learning_rate": 4.034201580435723e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8812618553638458, + "num_tokens": 152820865.0, + "step": 1429 + }, + { + "epoch": 3.25769669327252, + "grad_norm": 3.0, + "learning_rate": 4.0327140599535954e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8711613863706589, + "num_tokens": 152927618.0, + "step": 1430 + }, + { + "epoch": 3.259977194982896, + "grad_norm": 2.296875, + "learning_rate": 4.031225669532558e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8751797676086426, + "num_tokens": 153034150.0, + "step": 1431 + }, + { + "epoch": 3.2622576966932724, + "grad_norm": 2.546875, + "learning_rate": 4.029736410017392e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8789128512144089, + "num_tokens": 153141189.0, + "step": 1432 + }, + { + "epoch": 3.264538198403649, + "grad_norm": 4.5, + "learning_rate": 4.028246282253373e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.8788965493440628, + "num_tokens": 153247856.0, + "step": 1433 + }, + { + "epoch": 3.266818700114025, + "grad_norm": 2.828125, + "learning_rate": 4.026755287086267e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8801927268505096, + "num_tokens": 153355189.0, + "step": 1434 + }, + { + "epoch": 3.2690992018244014, + "grad_norm": 6.34375, + "learning_rate": 4.025263425362335e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8794397115707397, + "num_tokens": 153461890.0, + "step": 1435 + }, + { + "epoch": 3.2713797035347776, + "grad_norm": 3.953125, + "learning_rate": 4.0237706979283306e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8837473094463348, + "num_tokens": 153568765.0, + "step": 1436 + }, + { + "epoch": 3.2736602052451538, + "grad_norm": 3.390625, + "learning_rate": 4.022277105631495e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.882627084851265, + "num_tokens": 153675854.0, + "step": 1437 + }, + { + "epoch": 3.27594070695553, + "grad_norm": 2.515625, + "learning_rate": 4.020782649319563e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8796237707138062, + "num_tokens": 153783094.0, + "step": 1438 + }, + { + "epoch": 3.2782212086659066, + "grad_norm": 3.3125, + "learning_rate": 4.019287329840759e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8813715130090714, + "num_tokens": 153889764.0, + "step": 1439 + }, + { + "epoch": 3.280501710376283, + "grad_norm": 2.8125, + "learning_rate": 4.017791148043797e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.882407933473587, + "num_tokens": 153996810.0, + "step": 1440 + }, + { + "epoch": 3.282782212086659, + "grad_norm": 2.578125, + "learning_rate": 4.016294104777883e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8812000006437302, + "num_tokens": 154103786.0, + "step": 1441 + }, + { + "epoch": 3.285062713797035, + "grad_norm": 2.265625, + "learning_rate": 4.0147962008927065e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8824433386325836, + "num_tokens": 154211857.0, + "step": 1442 + }, + { + "epoch": 3.287343215507412, + "grad_norm": 3.34375, + "learning_rate": 4.013297437238452e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8849389404058456, + "num_tokens": 154318483.0, + "step": 1443 + }, + { + "epoch": 3.289623717217788, + "grad_norm": 3.609375, + "learning_rate": 4.011797814665787e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8829954713582993, + "num_tokens": 154425544.0, + "step": 1444 + }, + { + "epoch": 3.291904218928164, + "grad_norm": 2.375, + "learning_rate": 4.010297334025869e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8852455317974091, + "num_tokens": 154532774.0, + "step": 1445 + }, + { + "epoch": 3.2941847206385404, + "grad_norm": 2.4375, + "learning_rate": 4.008795996170341e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8847774416208267, + "num_tokens": 154639810.0, + "step": 1446 + }, + { + "epoch": 3.2964652223489166, + "grad_norm": 2.46875, + "learning_rate": 4.0072938019513345e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8774778991937637, + "num_tokens": 154747419.0, + "step": 1447 + }, + { + "epoch": 3.2987457240592932, + "grad_norm": 3.328125, + "learning_rate": 4.0057907522214646e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8795448988676071, + "num_tokens": 154855354.0, + "step": 1448 + }, + { + "epoch": 3.3010262257696694, + "grad_norm": 3.140625, + "learning_rate": 4.004286847833835e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8828533291816711, + "num_tokens": 154962700.0, + "step": 1449 + }, + { + "epoch": 3.3033067274800456, + "grad_norm": 4.0625, + "learning_rate": 4.002782089642031e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8755003958940506, + "num_tokens": 155069571.0, + "step": 1450 + }, + { + "epoch": 3.305587229190422, + "grad_norm": 2.375, + "learning_rate": 4.001276478500127e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8879126906394958, + "num_tokens": 155177593.0, + "step": 1451 + }, + { + "epoch": 3.307867730900798, + "grad_norm": 2.671875, + "learning_rate": 3.9997700152626755e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8830482810735703, + "num_tokens": 155284329.0, + "step": 1452 + }, + { + "epoch": 3.3101482326111746, + "grad_norm": 8.375, + "learning_rate": 3.9982627007847186e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8783923089504242, + "num_tokens": 155392445.0, + "step": 1453 + }, + { + "epoch": 3.312428734321551, + "grad_norm": 4.5625, + "learning_rate": 3.996754535921777e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.8857136368751526, + "num_tokens": 155499630.0, + "step": 1454 + }, + { + "epoch": 3.314709236031927, + "grad_norm": 3.1875, + "learning_rate": 3.995245521529857e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8815815448760986, + "num_tokens": 155606583.0, + "step": 1455 + }, + { + "epoch": 3.316989737742303, + "grad_norm": 2.78125, + "learning_rate": 3.993735658465446e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.880055382847786, + "num_tokens": 155712881.0, + "step": 1456 + }, + { + "epoch": 3.3192702394526794, + "grad_norm": 4.375, + "learning_rate": 3.992224947585513e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.8751248270273209, + "num_tokens": 155820166.0, + "step": 1457 + }, + { + "epoch": 3.321550741163056, + "grad_norm": 8.3125, + "learning_rate": 3.990713389747508e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8792251795530319, + "num_tokens": 155927043.0, + "step": 1458 + }, + { + "epoch": 3.3238312428734322, + "grad_norm": 6.21875, + "learning_rate": 3.989200985809362e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8834533840417862, + "num_tokens": 156034044.0, + "step": 1459 + }, + { + "epoch": 3.3261117445838084, + "grad_norm": 3.71875, + "learning_rate": 3.987687736629487e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8784044981002808, + "num_tokens": 156140733.0, + "step": 1460 + }, + { + "epoch": 3.3283922462941846, + "grad_norm": 3.71875, + "learning_rate": 3.986173643066774e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8763966262340546, + "num_tokens": 156247552.0, + "step": 1461 + }, + { + "epoch": 3.330672748004561, + "grad_norm": 2.9375, + "learning_rate": 3.984658705980593e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8726420104503632, + "num_tokens": 156355329.0, + "step": 1462 + }, + { + "epoch": 3.3329532497149374, + "grad_norm": 5.34375, + "learning_rate": 3.983142926230792e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.880921483039856, + "num_tokens": 156462676.0, + "step": 1463 + }, + { + "epoch": 3.3352337514253136, + "grad_norm": 3.921875, + "learning_rate": 3.981626304677701e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8802833706140518, + "num_tokens": 156570120.0, + "step": 1464 + }, + { + "epoch": 3.33751425313569, + "grad_norm": 3.796875, + "learning_rate": 3.980108842182121e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8745223730802536, + "num_tokens": 156677584.0, + "step": 1465 + }, + { + "epoch": 3.339794754846066, + "grad_norm": 3.453125, + "learning_rate": 3.978590539605338e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8785550892353058, + "num_tokens": 156785538.0, + "step": 1466 + }, + { + "epoch": 3.342075256556442, + "grad_norm": 3.109375, + "learning_rate": 3.97707139780911e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8855056762695312, + "num_tokens": 156893407.0, + "step": 1467 + }, + { + "epoch": 3.344355758266819, + "grad_norm": 2.546875, + "learning_rate": 3.975551417655673e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8805434554815292, + "num_tokens": 157000135.0, + "step": 1468 + }, + { + "epoch": 3.346636259977195, + "grad_norm": 2.703125, + "learning_rate": 3.974030600007737e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.880170151591301, + "num_tokens": 157106985.0, + "step": 1469 + }, + { + "epoch": 3.3489167616875712, + "grad_norm": 2.421875, + "learning_rate": 3.97250894572849e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8791963756084442, + "num_tokens": 157213948.0, + "step": 1470 + }, + { + "epoch": 3.3511972633979474, + "grad_norm": 5.0625, + "learning_rate": 3.970986455681593e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8822743445634842, + "num_tokens": 157321436.0, + "step": 1471 + }, + { + "epoch": 3.353477765108324, + "grad_norm": 2.46875, + "learning_rate": 3.969463130731183e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8710167407989502, + "num_tokens": 157427856.0, + "step": 1472 + }, + { + "epoch": 3.3557582668187003, + "grad_norm": 2.78125, + "learning_rate": 3.967938971741869e-06, + "loss": 0.597, + "mean_token_accuracy": 0.880034327507019, + "num_tokens": 157535511.0, + "step": 1473 + }, + { + "epoch": 3.3580387685290765, + "grad_norm": 4.65625, + "learning_rate": 3.966413979578734e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.8794209510087967, + "num_tokens": 157642677.0, + "step": 1474 + }, + { + "epoch": 3.3603192702394526, + "grad_norm": 2.59375, + "learning_rate": 3.964888155107335e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8807871788740158, + "num_tokens": 157749664.0, + "step": 1475 + }, + { + "epoch": 3.362599771949829, + "grad_norm": 3.578125, + "learning_rate": 3.963361499193699e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8778406232595444, + "num_tokens": 157857160.0, + "step": 1476 + }, + { + "epoch": 3.364880273660205, + "grad_norm": 2.359375, + "learning_rate": 3.9618340127043274e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.881255105137825, + "num_tokens": 157964040.0, + "step": 1477 + }, + { + "epoch": 3.3671607753705817, + "grad_norm": 3.515625, + "learning_rate": 3.960305696506192e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8831611126661301, + "num_tokens": 158071025.0, + "step": 1478 + }, + { + "epoch": 3.369441277080958, + "grad_norm": 2.84375, + "learning_rate": 3.958776551466737e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8826898485422134, + "num_tokens": 158177813.0, + "step": 1479 + }, + { + "epoch": 3.371721778791334, + "grad_norm": 2.53125, + "learning_rate": 3.957246578453873e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8766767531633377, + "num_tokens": 158284056.0, + "step": 1480 + }, + { + "epoch": 3.3740022805017102, + "grad_norm": 4.65625, + "learning_rate": 3.955715778335984e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.882177472114563, + "num_tokens": 158390740.0, + "step": 1481 + }, + { + "epoch": 3.376282782212087, + "grad_norm": 2.859375, + "learning_rate": 3.954184151981924e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8798738569021225, + "num_tokens": 158497577.0, + "step": 1482 + }, + { + "epoch": 3.378563283922463, + "grad_norm": 3.203125, + "learning_rate": 3.952651700261012e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.8804901242256165, + "num_tokens": 158604558.0, + "step": 1483 + }, + { + "epoch": 3.3808437856328393, + "grad_norm": 3.640625, + "learning_rate": 3.95111842404304e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8816994279623032, + "num_tokens": 158712603.0, + "step": 1484 + }, + { + "epoch": 3.3831242873432155, + "grad_norm": 2.59375, + "learning_rate": 3.949584324198266e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8765831738710403, + "num_tokens": 158819917.0, + "step": 1485 + }, + { + "epoch": 3.3854047890535917, + "grad_norm": 2.828125, + "learning_rate": 3.948049401597414e-06, + "loss": 0.576, + "mean_token_accuracy": 0.881806880235672, + "num_tokens": 158927113.0, + "step": 1486 + }, + { + "epoch": 3.387685290763968, + "grad_norm": 3.578125, + "learning_rate": 3.946513657111678e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.881575807929039, + "num_tokens": 159034584.0, + "step": 1487 + }, + { + "epoch": 3.3899657924743445, + "grad_norm": 2.40625, + "learning_rate": 3.944977091612716e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.880961686372757, + "num_tokens": 159141695.0, + "step": 1488 + }, + { + "epoch": 3.3922462941847207, + "grad_norm": 2.734375, + "learning_rate": 3.943439705972654e-06, + "loss": 0.604, + "mean_token_accuracy": 0.8772746175527573, + "num_tokens": 159248763.0, + "step": 1489 + }, + { + "epoch": 3.394526795895097, + "grad_norm": 3.015625, + "learning_rate": 3.94190150106408e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8807346522808075, + "num_tokens": 159355766.0, + "step": 1490 + }, + { + "epoch": 3.396807297605473, + "grad_norm": 3.671875, + "learning_rate": 3.9403624777600526e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8799644559621811, + "num_tokens": 159462660.0, + "step": 1491 + }, + { + "epoch": 3.3990877993158497, + "grad_norm": 4.15625, + "learning_rate": 3.938822636934089e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8790238797664642, + "num_tokens": 159569562.0, + "step": 1492 + }, + { + "epoch": 3.401368301026226, + "grad_norm": 7.1875, + "learning_rate": 3.937281979460175e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8814513683319092, + "num_tokens": 159676589.0, + "step": 1493 + }, + { + "epoch": 3.403648802736602, + "grad_norm": 3.859375, + "learning_rate": 3.9357405062127565e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8827214390039444, + "num_tokens": 159783562.0, + "step": 1494 + }, + { + "epoch": 3.4059293044469783, + "grad_norm": 2.875, + "learning_rate": 3.934198218066745e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.882139042019844, + "num_tokens": 159891070.0, + "step": 1495 + }, + { + "epoch": 3.4082098061573545, + "grad_norm": 6.34375, + "learning_rate": 3.932655115897513e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8820941895246506, + "num_tokens": 159997890.0, + "step": 1496 + }, + { + "epoch": 3.4104903078677307, + "grad_norm": 4.0, + "learning_rate": 3.9311112005808955e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8780702501535416, + "num_tokens": 160104922.0, + "step": 1497 + }, + { + "epoch": 3.4127708095781073, + "grad_norm": 6.15625, + "learning_rate": 3.92956647299319e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8826514631509781, + "num_tokens": 160211984.0, + "step": 1498 + }, + { + "epoch": 3.4150513112884835, + "grad_norm": 3.515625, + "learning_rate": 3.928020934011153e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.876240611076355, + "num_tokens": 160319031.0, + "step": 1499 + }, + { + "epoch": 3.4173318129988597, + "grad_norm": 2.296875, + "learning_rate": 3.926474584512002e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8836843520402908, + "num_tokens": 160425892.0, + "step": 1500 + }, + { + "epoch": 3.419612314709236, + "grad_norm": 2.921875, + "learning_rate": 3.924927425373417e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8801800161600113, + "num_tokens": 160533225.0, + "step": 1501 + }, + { + "epoch": 3.4218928164196125, + "grad_norm": 7.0625, + "learning_rate": 3.9233794574735345e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8788256347179413, + "num_tokens": 160640613.0, + "step": 1502 + }, + { + "epoch": 3.4241733181299887, + "grad_norm": 5.8125, + "learning_rate": 3.921830681690951e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8749442100524902, + "num_tokens": 160746763.0, + "step": 1503 + }, + { + "epoch": 3.426453819840365, + "grad_norm": 2.8125, + "learning_rate": 3.920281098904722e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8785606771707535, + "num_tokens": 160854121.0, + "step": 1504 + }, + { + "epoch": 3.428734321550741, + "grad_norm": 2.515625, + "learning_rate": 3.918730709994361e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8799080401659012, + "num_tokens": 160961324.0, + "step": 1505 + }, + { + "epoch": 3.4310148232611173, + "grad_norm": 3.15625, + "learning_rate": 3.91717951583984e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8835075944662094, + "num_tokens": 161068109.0, + "step": 1506 + }, + { + "epoch": 3.433295324971494, + "grad_norm": 2.609375, + "learning_rate": 3.915627517321584e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8793487548828125, + "num_tokens": 161175558.0, + "step": 1507 + }, + { + "epoch": 3.43557582668187, + "grad_norm": 5.03125, + "learning_rate": 3.914074715320479e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8839425444602966, + "num_tokens": 161282715.0, + "step": 1508 + }, + { + "epoch": 3.4378563283922463, + "grad_norm": 4.15625, + "learning_rate": 3.912521110717866e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.879220187664032, + "num_tokens": 161390083.0, + "step": 1509 + }, + { + "epoch": 3.4401368301026225, + "grad_norm": 3.1875, + "learning_rate": 3.9109667043955405e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8790208101272583, + "num_tokens": 161497044.0, + "step": 1510 + }, + { + "epoch": 3.4424173318129987, + "grad_norm": 3.78125, + "learning_rate": 3.909411497235752e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8846622407436371, + "num_tokens": 161603896.0, + "step": 1511 + }, + { + "epoch": 3.4446978335233753, + "grad_norm": 4.4375, + "learning_rate": 3.907855490121208e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8800790905952454, + "num_tokens": 161710931.0, + "step": 1512 + }, + { + "epoch": 3.4469783352337515, + "grad_norm": 3.859375, + "learning_rate": 3.906298683935068e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8813421279191971, + "num_tokens": 161817975.0, + "step": 1513 + }, + { + "epoch": 3.4492588369441277, + "grad_norm": 2.4375, + "learning_rate": 3.904741079560944e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8818980902433395, + "num_tokens": 161925410.0, + "step": 1514 + }, + { + "epoch": 3.451539338654504, + "grad_norm": 2.6875, + "learning_rate": 3.903182677882904e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8834614157676697, + "num_tokens": 162032741.0, + "step": 1515 + }, + { + "epoch": 3.45381984036488, + "grad_norm": 3.875, + "learning_rate": 3.901623479785465e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8787531405687332, + "num_tokens": 162139480.0, + "step": 1516 + }, + { + "epoch": 3.4561003420752567, + "grad_norm": 3.03125, + "learning_rate": 3.900063486153598e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8862394094467163, + "num_tokens": 162246597.0, + "step": 1517 + }, + { + "epoch": 3.458380843785633, + "grad_norm": 3.0625, + "learning_rate": 3.898502697872725e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8790966123342514, + "num_tokens": 162353845.0, + "step": 1518 + }, + { + "epoch": 3.460661345496009, + "grad_norm": 2.421875, + "learning_rate": 3.896941115828721e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8806100636720657, + "num_tokens": 162460754.0, + "step": 1519 + }, + { + "epoch": 3.4629418472063853, + "grad_norm": 3.9375, + "learning_rate": 3.895378740907908e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.8727654367685318, + "num_tokens": 162567222.0, + "step": 1520 + }, + { + "epoch": 3.4652223489167615, + "grad_norm": 2.671875, + "learning_rate": 3.89381557399706e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8774588257074356, + "num_tokens": 162674330.0, + "step": 1521 + }, + { + "epoch": 3.467502850627138, + "grad_norm": 2.515625, + "learning_rate": 3.892251615983401e-06, + "loss": 0.587, + "mean_token_accuracy": 0.8811975121498108, + "num_tokens": 162781710.0, + "step": 1522 + }, + { + "epoch": 3.4697833523375143, + "grad_norm": 2.65625, + "learning_rate": 3.890686867754604e-06, + "loss": 0.6, + "mean_token_accuracy": 0.8789321780204773, + "num_tokens": 162888560.0, + "step": 1523 + }, + { + "epoch": 3.4720638540478905, + "grad_norm": 2.546875, + "learning_rate": 3.889121330198788e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8869301974773407, + "num_tokens": 162995694.0, + "step": 1524 + }, + { + "epoch": 3.4743443557582667, + "grad_norm": 2.578125, + "learning_rate": 3.887555004204524e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8777419775724411, + "num_tokens": 163103255.0, + "step": 1525 + }, + { + "epoch": 3.476624857468643, + "grad_norm": 2.796875, + "learning_rate": 3.885987890660828e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8802089840173721, + "num_tokens": 163210670.0, + "step": 1526 + }, + { + "epoch": 3.4789053591790196, + "grad_norm": 3.234375, + "learning_rate": 3.884419990457161e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8770112693309784, + "num_tokens": 163317663.0, + "step": 1527 + }, + { + "epoch": 3.4811858608893957, + "grad_norm": 3.0, + "learning_rate": 3.882851304483436e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8784664124250412, + "num_tokens": 163424638.0, + "step": 1528 + }, + { + "epoch": 3.483466362599772, + "grad_norm": 2.75, + "learning_rate": 3.881281833630007e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8907039016485214, + "num_tokens": 163532458.0, + "step": 1529 + }, + { + "epoch": 3.485746864310148, + "grad_norm": 3.9375, + "learning_rate": 3.879711578787676e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8753378838300705, + "num_tokens": 163639976.0, + "step": 1530 + }, + { + "epoch": 3.4880273660205243, + "grad_norm": 2.625, + "learning_rate": 3.87814054084769e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8845688998699188, + "num_tokens": 163747328.0, + "step": 1531 + }, + { + "epoch": 3.490307867730901, + "grad_norm": 3.015625, + "learning_rate": 3.8765687207017375e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8783632963895798, + "num_tokens": 163854482.0, + "step": 1532 + }, + { + "epoch": 3.492588369441277, + "grad_norm": 2.4375, + "learning_rate": 3.874996119241956e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8804601579904556, + "num_tokens": 163961669.0, + "step": 1533 + }, + { + "epoch": 3.4948688711516533, + "grad_norm": 3.125, + "learning_rate": 3.873422737360922e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8807873427867889, + "num_tokens": 164068614.0, + "step": 1534 + }, + { + "epoch": 3.4971493728620295, + "grad_norm": 2.609375, + "learning_rate": 3.871848575951658e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8857271820306778, + "num_tokens": 164175640.0, + "step": 1535 + }, + { + "epoch": 3.4994298745724057, + "grad_norm": 3.546875, + "learning_rate": 3.8702736359076265e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8821373879909515, + "num_tokens": 164283025.0, + "step": 1536 + }, + { + "epoch": 3.5017103762827824, + "grad_norm": 2.140625, + "learning_rate": 3.868697918122733e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8805082738399506, + "num_tokens": 164389945.0, + "step": 1537 + }, + { + "epoch": 3.5039908779931586, + "grad_norm": 4.4375, + "learning_rate": 3.867121423491325e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8843076974153519, + "num_tokens": 164497358.0, + "step": 1538 + }, + { + "epoch": 3.5062713797035348, + "grad_norm": 4.71875, + "learning_rate": 3.86554415290819e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.8772299587726593, + "num_tokens": 164604547.0, + "step": 1539 + }, + { + "epoch": 3.508551881413911, + "grad_norm": 3.0, + "learning_rate": 3.8639661072685575e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8764625191688538, + "num_tokens": 164711483.0, + "step": 1540 + }, + { + "epoch": 3.508551881413911, + "eval_loss": 0.5934817790985107, + "eval_mean_token_accuracy": 0.8786315757058872, + "eval_num_tokens": 164711483.0, + "eval_runtime": 58.6311, + "eval_samples_per_second": 143.013, + "eval_steps_per_second": 4.486, + "step": 1540 + }, + { + "epoch": 3.5108323831242876, + "grad_norm": 2.46875, + "learning_rate": 3.862387287468095e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.884019061923027, + "num_tokens": 164818766.0, + "step": 1541 + }, + { + "epoch": 3.5131128848346638, + "grad_norm": 3.03125, + "learning_rate": 3.860807694402909e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.872483566403389, + "num_tokens": 164925458.0, + "step": 1542 + }, + { + "epoch": 3.51539338654504, + "grad_norm": 2.46875, + "learning_rate": 3.859227328969547e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8828099370002747, + "num_tokens": 165033022.0, + "step": 1543 + }, + { + "epoch": 3.517673888255416, + "grad_norm": 4.09375, + "learning_rate": 3.857646192064995e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.885229229927063, + "num_tokens": 165140269.0, + "step": 1544 + }, + { + "epoch": 3.5199543899657924, + "grad_norm": 2.640625, + "learning_rate": 3.856064284586674e-06, + "loss": 0.581, + "mean_token_accuracy": 0.878631979227066, + "num_tokens": 165247172.0, + "step": 1545 + }, + { + "epoch": 3.5222348916761685, + "grad_norm": 4.84375, + "learning_rate": 3.854481607432445e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.881737545132637, + "num_tokens": 165353974.0, + "step": 1546 + }, + { + "epoch": 3.524515393386545, + "grad_norm": 2.609375, + "learning_rate": 3.852898161500605e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8790680915117264, + "num_tokens": 165461254.0, + "step": 1547 + }, + { + "epoch": 3.5267958950969214, + "grad_norm": 3.09375, + "learning_rate": 3.851313947689888e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8754421025514603, + "num_tokens": 165568029.0, + "step": 1548 + }, + { + "epoch": 3.5290763968072976, + "grad_norm": 2.84375, + "learning_rate": 3.849728966899462e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8801588118076324, + "num_tokens": 165675255.0, + "step": 1549 + }, + { + "epoch": 3.5313568985176738, + "grad_norm": 3.234375, + "learning_rate": 3.848143220028931e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8785509765148163, + "num_tokens": 165781381.0, + "step": 1550 + }, + { + "epoch": 3.5336374002280504, + "grad_norm": 2.203125, + "learning_rate": 3.846556707978337e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8853710889816284, + "num_tokens": 165888356.0, + "step": 1551 + }, + { + "epoch": 3.5359179019384266, + "grad_norm": 3.515625, + "learning_rate": 3.844969431648151e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8783566206693649, + "num_tokens": 165995028.0, + "step": 1552 + }, + { + "epoch": 3.538198403648803, + "grad_norm": 3.171875, + "learning_rate": 3.843381391939281e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8758636116981506, + "num_tokens": 166102173.0, + "step": 1553 + }, + { + "epoch": 3.540478905359179, + "grad_norm": 2.65625, + "learning_rate": 3.841792589753067e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8812205046415329, + "num_tokens": 166209381.0, + "step": 1554 + }, + { + "epoch": 3.542759407069555, + "grad_norm": 5.65625, + "learning_rate": 3.840203025991285e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8851082473993301, + "num_tokens": 166316578.0, + "step": 1555 + }, + { + "epoch": 3.5450399087799314, + "grad_norm": 3.484375, + "learning_rate": 3.838612701556138e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.879833921790123, + "num_tokens": 166423915.0, + "step": 1556 + }, + { + "epoch": 3.547320410490308, + "grad_norm": 3.125, + "learning_rate": 3.837021617350266e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8793807029724121, + "num_tokens": 166530907.0, + "step": 1557 + }, + { + "epoch": 3.549600912200684, + "grad_norm": 2.375, + "learning_rate": 3.8354297742767345e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8835282325744629, + "num_tokens": 166638263.0, + "step": 1558 + }, + { + "epoch": 3.5518814139110604, + "grad_norm": 4.15625, + "learning_rate": 3.833837173239044e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8798348754644394, + "num_tokens": 166745468.0, + "step": 1559 + }, + { + "epoch": 3.5541619156214366, + "grad_norm": 3.265625, + "learning_rate": 3.832243815141126e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8829097151756287, + "num_tokens": 166852138.0, + "step": 1560 + }, + { + "epoch": 3.556442417331813, + "grad_norm": 2.703125, + "learning_rate": 3.830649700887339e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.8771270960569382, + "num_tokens": 166958904.0, + "step": 1561 + }, + { + "epoch": 3.5587229190421894, + "grad_norm": 2.4375, + "learning_rate": 3.829054831382471e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8783409595489502, + "num_tokens": 167066230.0, + "step": 1562 + }, + { + "epoch": 3.5610034207525656, + "grad_norm": 2.453125, + "learning_rate": 3.827459207531739e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8838016241788864, + "num_tokens": 167173536.0, + "step": 1563 + }, + { + "epoch": 3.563283922462942, + "grad_norm": 2.625, + "learning_rate": 3.825862830240787e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8744715452194214, + "num_tokens": 167280709.0, + "step": 1564 + }, + { + "epoch": 3.565564424173318, + "grad_norm": 3.140625, + "learning_rate": 3.82426570041569e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8770562261343002, + "num_tokens": 167387081.0, + "step": 1565 + }, + { + "epoch": 3.567844925883694, + "grad_norm": 2.71875, + "learning_rate": 3.822667818962948e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8852861523628235, + "num_tokens": 167494025.0, + "step": 1566 + }, + { + "epoch": 3.570125427594071, + "grad_norm": 2.5, + "learning_rate": 3.821069186789486e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8804926127195358, + "num_tokens": 167600832.0, + "step": 1567 + }, + { + "epoch": 3.572405929304447, + "grad_norm": 3.296875, + "learning_rate": 3.819469804802659e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8771549314260483, + "num_tokens": 167707990.0, + "step": 1568 + }, + { + "epoch": 3.574686431014823, + "grad_norm": 2.578125, + "learning_rate": 3.8178696739102435e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8794214427471161, + "num_tokens": 167815132.0, + "step": 1569 + }, + { + "epoch": 3.5769669327251994, + "grad_norm": 5.65625, + "learning_rate": 3.816268795020443e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8854867666959763, + "num_tokens": 167922544.0, + "step": 1570 + }, + { + "epoch": 3.579247434435576, + "grad_norm": 2.84375, + "learning_rate": 3.814667169041887e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8798968493938446, + "num_tokens": 168029140.0, + "step": 1571 + }, + { + "epoch": 3.581527936145952, + "grad_norm": 5.09375, + "learning_rate": 3.8130647968836254e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.8785337060689926, + "num_tokens": 168136139.0, + "step": 1572 + }, + { + "epoch": 3.5838084378563284, + "grad_norm": 2.34375, + "learning_rate": 3.811461679455136e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8779959082603455, + "num_tokens": 168242818.0, + "step": 1573 + }, + { + "epoch": 3.5860889395667046, + "grad_norm": 3.765625, + "learning_rate": 3.809857817666316e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.8785370439291, + "num_tokens": 168350600.0, + "step": 1574 + }, + { + "epoch": 3.588369441277081, + "grad_norm": 2.359375, + "learning_rate": 3.808253212427486e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.879870593547821, + "num_tokens": 168457735.0, + "step": 1575 + }, + { + "epoch": 3.590649942987457, + "grad_norm": 2.78125, + "learning_rate": 3.8066478646493898e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8814902901649475, + "num_tokens": 168564725.0, + "step": 1576 + }, + { + "epoch": 3.5929304446978336, + "grad_norm": 4.65625, + "learning_rate": 3.805041775243191e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8772566318511963, + "num_tokens": 168671913.0, + "step": 1577 + }, + { + "epoch": 3.59521094640821, + "grad_norm": 3.53125, + "learning_rate": 3.803434945120475e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.8856948614120483, + "num_tokens": 168779191.0, + "step": 1578 + }, + { + "epoch": 3.597491448118586, + "grad_norm": 2.90625, + "learning_rate": 3.801827375193249e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8798210918903351, + "num_tokens": 168886143.0, + "step": 1579 + }, + { + "epoch": 3.5997719498289626, + "grad_norm": 6.9375, + "learning_rate": 3.8002190663739362e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8814046531915665, + "num_tokens": 168993134.0, + "step": 1580 + }, + { + "epoch": 3.602052451539339, + "grad_norm": 8.6875, + "learning_rate": 3.798610019575384e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8799954205751419, + "num_tokens": 169099760.0, + "step": 1581 + }, + { + "epoch": 3.604332953249715, + "grad_norm": 6.90625, + "learning_rate": 3.7970002357108554e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8843145817518234, + "num_tokens": 169206545.0, + "step": 1582 + }, + { + "epoch": 3.6066134549600912, + "grad_norm": 3.625, + "learning_rate": 3.7953897156940323e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8787747472524643, + "num_tokens": 169313229.0, + "step": 1583 + }, + { + "epoch": 3.6088939566704674, + "grad_norm": 2.75, + "learning_rate": 3.793778460439015e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8789673447608948, + "num_tokens": 169420044.0, + "step": 1584 + }, + { + "epoch": 3.6111744583808436, + "grad_norm": 4.59375, + "learning_rate": 3.792166470860321e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8790552765130997, + "num_tokens": 169526883.0, + "step": 1585 + }, + { + "epoch": 3.61345496009122, + "grad_norm": 4.03125, + "learning_rate": 3.790553747872885e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8849265277385712, + "num_tokens": 169633699.0, + "step": 1586 + }, + { + "epoch": 3.6157354618015964, + "grad_norm": 4.96875, + "learning_rate": 3.788940292392056e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8801223486661911, + "num_tokens": 169741250.0, + "step": 1587 + }, + { + "epoch": 3.6180159635119726, + "grad_norm": 5.59375, + "learning_rate": 3.787326105333601e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8809481114149094, + "num_tokens": 169848862.0, + "step": 1588 + }, + { + "epoch": 3.620296465222349, + "grad_norm": 2.453125, + "learning_rate": 3.7857111876137017e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8804655224084854, + "num_tokens": 169955712.0, + "step": 1589 + }, + { + "epoch": 3.6225769669327255, + "grad_norm": 2.9375, + "learning_rate": 3.784095540148954e-06, + "loss": 0.5981, + "mean_token_accuracy": 0.8755791783332825, + "num_tokens": 170062534.0, + "step": 1590 + }, + { + "epoch": 3.6248574686431017, + "grad_norm": 2.859375, + "learning_rate": 3.7824791638563674e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8761962950229645, + "num_tokens": 170169349.0, + "step": 1591 + }, + { + "epoch": 3.627137970353478, + "grad_norm": 4.21875, + "learning_rate": 3.7808620596533675e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8823762983083725, + "num_tokens": 170276611.0, + "step": 1592 + }, + { + "epoch": 3.629418472063854, + "grad_norm": 3.234375, + "learning_rate": 3.77924422845779e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.873461440205574, + "num_tokens": 170383192.0, + "step": 1593 + }, + { + "epoch": 3.6316989737742302, + "grad_norm": 2.65625, + "learning_rate": 3.7776256711878856e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8810142874717712, + "num_tokens": 170490101.0, + "step": 1594 + }, + { + "epoch": 3.6339794754846064, + "grad_norm": 2.765625, + "learning_rate": 3.7760063887623155e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8790360242128372, + "num_tokens": 170596909.0, + "step": 1595 + }, + { + "epoch": 3.636259977194983, + "grad_norm": 3.4375, + "learning_rate": 3.7743863821001538e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.886503741145134, + "num_tokens": 170704691.0, + "step": 1596 + }, + { + "epoch": 3.6385404789053593, + "grad_norm": 2.96875, + "learning_rate": 3.7727656521208843e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8780782669782639, + "num_tokens": 170811447.0, + "step": 1597 + }, + { + "epoch": 3.6408209806157354, + "grad_norm": 3.796875, + "learning_rate": 3.771144199744402e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8789367228746414, + "num_tokens": 170918012.0, + "step": 1598 + }, + { + "epoch": 3.6431014823261116, + "grad_norm": 2.421875, + "learning_rate": 3.7695220258910124e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8771436512470245, + "num_tokens": 171024919.0, + "step": 1599 + }, + { + "epoch": 3.6453819840364883, + "grad_norm": 3.625, + "learning_rate": 3.7678991314814305e-06, + "loss": 0.6, + "mean_token_accuracy": 0.8739635199308395, + "num_tokens": 171131530.0, + "step": 1600 + }, + { + "epoch": 3.6476624857468645, + "grad_norm": 3.84375, + "learning_rate": 3.766275517436779e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8780031353235245, + "num_tokens": 171238533.0, + "step": 1601 + }, + { + "epoch": 3.6499429874572407, + "grad_norm": 3.15625, + "learning_rate": 3.7646511846785904e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8816352784633636, + "num_tokens": 171345881.0, + "step": 1602 + }, + { + "epoch": 3.652223489167617, + "grad_norm": 4.0625, + "learning_rate": 3.7630261341288044e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.883026197552681, + "num_tokens": 171452807.0, + "step": 1603 + }, + { + "epoch": 3.654503990877993, + "grad_norm": 4.0, + "learning_rate": 3.7614003667097674e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8770434111356735, + "num_tokens": 171559794.0, + "step": 1604 + }, + { + "epoch": 3.6567844925883692, + "grad_norm": 2.515625, + "learning_rate": 3.759773883344236e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8752636462450027, + "num_tokens": 171666284.0, + "step": 1605 + }, + { + "epoch": 3.659064994298746, + "grad_norm": 5.625, + "learning_rate": 3.7581466849553685e-06, + "loss": 0.569, + "mean_token_accuracy": 0.884151503443718, + "num_tokens": 171774291.0, + "step": 1606 + }, + { + "epoch": 3.661345496009122, + "grad_norm": 2.5, + "learning_rate": 3.7565187724667324e-06, + "loss": 0.562, + "mean_token_accuracy": 0.8847729563713074, + "num_tokens": 171881709.0, + "step": 1607 + }, + { + "epoch": 3.6636259977194983, + "grad_norm": 2.53125, + "learning_rate": 3.7548901468022993e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8790787905454636, + "num_tokens": 171988700.0, + "step": 1608 + }, + { + "epoch": 3.6659064994298745, + "grad_norm": 3.5, + "learning_rate": 3.7532608088864444e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.876745343208313, + "num_tokens": 172095530.0, + "step": 1609 + }, + { + "epoch": 3.668187001140251, + "grad_norm": 3.0625, + "learning_rate": 3.75163075964395e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8810919225215912, + "num_tokens": 172202580.0, + "step": 1610 + }, + { + "epoch": 3.6704675028506273, + "grad_norm": 3.140625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8769372403621674, + "num_tokens": 172310027.0, + "step": 1611 + }, + { + "epoch": 3.6727480045610035, + "grad_norm": 2.265625, + "learning_rate": 3.748368530880183e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8828428983688354, + "num_tokens": 172416427.0, + "step": 1612 + }, + { + "epoch": 3.6750285062713797, + "grad_norm": 2.6875, + "learning_rate": 3.7467363532104874e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8848859220743179, + "num_tokens": 172523861.0, + "step": 1613 + }, + { + "epoch": 3.677309007981756, + "grad_norm": 2.796875, + "learning_rate": 3.7451034679173082e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8770375549793243, + "num_tokens": 172630804.0, + "step": 1614 + }, + { + "epoch": 3.679589509692132, + "grad_norm": 2.5625, + "learning_rate": 3.7434698759274366e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8780438005924225, + "num_tokens": 172738455.0, + "step": 1615 + }, + { + "epoch": 3.6818700114025087, + "grad_norm": 3.40625, + "learning_rate": 3.741835578168071e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8857946693897247, + "num_tokens": 172845911.0, + "step": 1616 + }, + { + "epoch": 3.684150513112885, + "grad_norm": 4.4375, + "learning_rate": 3.740200575566806e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8763966113328934, + "num_tokens": 172952684.0, + "step": 1617 + }, + { + "epoch": 3.686431014823261, + "grad_norm": 2.515625, + "learning_rate": 3.7385648690516364e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.879105344414711, + "num_tokens": 173059713.0, + "step": 1618 + }, + { + "epoch": 3.6887115165336373, + "grad_norm": 3.953125, + "learning_rate": 3.7369284595509587e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.873798742890358, + "num_tokens": 173167102.0, + "step": 1619 + }, + { + "epoch": 3.690992018244014, + "grad_norm": 2.890625, + "learning_rate": 3.7352913479935672e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8798558115959167, + "num_tokens": 173274635.0, + "step": 1620 + }, + { + "epoch": 3.69327251995439, + "grad_norm": 2.765625, + "learning_rate": 3.7336535353086546e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8765471577644348, + "num_tokens": 173382044.0, + "step": 1621 + }, + { + "epoch": 3.6955530216647663, + "grad_norm": 2.921875, + "learning_rate": 3.7320150224258124e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8792853504419327, + "num_tokens": 173489148.0, + "step": 1622 + }, + { + "epoch": 3.6978335233751425, + "grad_norm": 3.25, + "learning_rate": 3.7303758102750274e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8817913979291916, + "num_tokens": 173596404.0, + "step": 1623 + }, + { + "epoch": 3.7001140250855187, + "grad_norm": 2.75, + "learning_rate": 3.7287358997866872e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8805738687515259, + "num_tokens": 173703327.0, + "step": 1624 + }, + { + "epoch": 3.702394526795895, + "grad_norm": 3.25, + "learning_rate": 3.7270952918915715e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8786198794841766, + "num_tokens": 173809938.0, + "step": 1625 + }, + { + "epoch": 3.7046750285062715, + "grad_norm": 2.71875, + "learning_rate": 3.7254539875208577e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8781810998916626, + "num_tokens": 173916969.0, + "step": 1626 + }, + { + "epoch": 3.7069555302166477, + "grad_norm": 2.703125, + "learning_rate": 3.7238119876061196e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8811867833137512, + "num_tokens": 174023879.0, + "step": 1627 + }, + { + "epoch": 3.709236031927024, + "grad_norm": 4.4375, + "learning_rate": 3.7221692930793234e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8845594227313995, + "num_tokens": 174130854.0, + "step": 1628 + }, + { + "epoch": 3.7115165336374, + "grad_norm": 2.71875, + "learning_rate": 3.7205259048728316e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8832680881023407, + "num_tokens": 174238556.0, + "step": 1629 + }, + { + "epoch": 3.7137970353477767, + "grad_norm": 2.515625, + "learning_rate": 3.718881823919399e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8850577622652054, + "num_tokens": 174346096.0, + "step": 1630 + }, + { + "epoch": 3.716077537058153, + "grad_norm": 3.28125, + "learning_rate": 3.717237051152175e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8809388726949692, + "num_tokens": 174453264.0, + "step": 1631 + }, + { + "epoch": 3.718358038768529, + "grad_norm": 3.8125, + "learning_rate": 3.7155915875047005e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8833019137382507, + "num_tokens": 174560352.0, + "step": 1632 + }, + { + "epoch": 3.7206385404789053, + "grad_norm": 4.0, + "learning_rate": 3.7139454339109082e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8777986019849777, + "num_tokens": 174667183.0, + "step": 1633 + }, + { + "epoch": 3.7229190421892815, + "grad_norm": 5.375, + "learning_rate": 3.7122985913051242e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8856906592845917, + "num_tokens": 174774451.0, + "step": 1634 + }, + { + "epoch": 3.7251995438996577, + "grad_norm": 2.203125, + "learning_rate": 3.710651060622064e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8867538422346115, + "num_tokens": 174881357.0, + "step": 1635 + }, + { + "epoch": 3.7274800456100343, + "grad_norm": 3.671875, + "learning_rate": 3.7090028427968343e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.8716320544481277, + "num_tokens": 174987742.0, + "step": 1636 + }, + { + "epoch": 3.7297605473204105, + "grad_norm": 3.546875, + "learning_rate": 3.7073539387649316e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8786651045084, + "num_tokens": 175094511.0, + "step": 1637 + }, + { + "epoch": 3.7320410490307867, + "grad_norm": 3.78125, + "learning_rate": 3.7057043494622423e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.8771418780088425, + "num_tokens": 175201546.0, + "step": 1638 + }, + { + "epoch": 3.734321550741163, + "grad_norm": 3.046875, + "learning_rate": 3.704054075825042e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8767105042934418, + "num_tokens": 175308332.0, + "step": 1639 + }, + { + "epoch": 3.7366020524515395, + "grad_norm": 2.8125, + "learning_rate": 3.702403118789992e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8864465802907944, + "num_tokens": 175415556.0, + "step": 1640 + }, + { + "epoch": 3.7388825541619157, + "grad_norm": 5.78125, + "learning_rate": 3.7007514792941462e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8830393701791763, + "num_tokens": 175522759.0, + "step": 1641 + }, + { + "epoch": 3.741163055872292, + "grad_norm": 3.03125, + "learning_rate": 3.6990991582749414e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.8837911188602448, + "num_tokens": 175630033.0, + "step": 1642 + }, + { + "epoch": 3.743443557582668, + "grad_norm": 3.125, + "learning_rate": 3.6974461566702048e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8770407438278198, + "num_tokens": 175737297.0, + "step": 1643 + }, + { + "epoch": 3.7457240592930443, + "grad_norm": 2.578125, + "learning_rate": 3.695792475418146e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8814648389816284, + "num_tokens": 175844521.0, + "step": 1644 + }, + { + "epoch": 3.7480045610034205, + "grad_norm": 6.34375, + "learning_rate": 3.6941381154573646e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.878902330994606, + "num_tokens": 175951433.0, + "step": 1645 + }, + { + "epoch": 3.750285062713797, + "grad_norm": 4.84375, + "learning_rate": 3.692483077726843e-06, + "loss": 0.57, + "mean_token_accuracy": 0.884130209684372, + "num_tokens": 176058602.0, + "step": 1646 + }, + { + "epoch": 3.7525655644241733, + "grad_norm": 4.03125, + "learning_rate": 3.6908273631659475e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8837015181779861, + "num_tokens": 176165024.0, + "step": 1647 + }, + { + "epoch": 3.7548460661345495, + "grad_norm": 4.5, + "learning_rate": 3.689170972714431e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8826144188642502, + "num_tokens": 176272341.0, + "step": 1648 + }, + { + "epoch": 3.757126567844926, + "grad_norm": 2.78125, + "learning_rate": 3.6875139073124277e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8824542462825775, + "num_tokens": 176379289.0, + "step": 1649 + }, + { + "epoch": 3.7594070695553023, + "grad_norm": 4.375, + "learning_rate": 3.6858561679004567e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8805209249258041, + "num_tokens": 176486406.0, + "step": 1650 + }, + { + "epoch": 3.7616875712656785, + "grad_norm": 5.21875, + "learning_rate": 3.684197755419419e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8811310976743698, + "num_tokens": 176593647.0, + "step": 1651 + }, + { + "epoch": 3.7639680729760547, + "grad_norm": 6.21875, + "learning_rate": 3.6825386708105963e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8876552581787109, + "num_tokens": 176700583.0, + "step": 1652 + }, + { + "epoch": 3.766248574686431, + "grad_norm": 2.890625, + "learning_rate": 3.6808789150156545e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8768218755722046, + "num_tokens": 176807605.0, + "step": 1653 + }, + { + "epoch": 3.768529076396807, + "grad_norm": 3.453125, + "learning_rate": 3.679218488976638e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.8734856247901917, + "num_tokens": 176914867.0, + "step": 1654 + }, + { + "epoch": 3.7708095781071833, + "grad_norm": 3.21875, + "learning_rate": 3.677557393635973e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8815526366233826, + "num_tokens": 177021989.0, + "step": 1655 + }, + { + "epoch": 3.77309007981756, + "grad_norm": 3.21875, + "learning_rate": 3.6758956299364643e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8808587789535522, + "num_tokens": 177128640.0, + "step": 1656 + }, + { + "epoch": 3.775370581527936, + "grad_norm": 2.375, + "learning_rate": 3.674233198821299e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8834094703197479, + "num_tokens": 177236418.0, + "step": 1657 + }, + { + "epoch": 3.7776510832383123, + "grad_norm": 3.078125, + "learning_rate": 3.6725701012340387e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8827496618032455, + "num_tokens": 177343371.0, + "step": 1658 + }, + { + "epoch": 3.779931584948689, + "grad_norm": 2.25, + "learning_rate": 3.6709063381186267e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8798905909061432, + "num_tokens": 177450862.0, + "step": 1659 + }, + { + "epoch": 3.782212086659065, + "grad_norm": 2.953125, + "learning_rate": 3.6692419104193823e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8762509971857071, + "num_tokens": 177557890.0, + "step": 1660 + }, + { + "epoch": 3.7844925883694414, + "grad_norm": 3.0, + "learning_rate": 3.6675768190810023e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8809296637773514, + "num_tokens": 177664846.0, + "step": 1661 + }, + { + "epoch": 3.7867730900798175, + "grad_norm": 2.578125, + "learning_rate": 3.665911065048561e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8829169422388077, + "num_tokens": 177772527.0, + "step": 1662 + }, + { + "epoch": 3.7890535917901937, + "grad_norm": 5.25, + "learning_rate": 3.6642446492675075e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8831394165754318, + "num_tokens": 177879427.0, + "step": 1663 + }, + { + "epoch": 3.79133409350057, + "grad_norm": 2.96875, + "learning_rate": 3.6625775726836677e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8788637816905975, + "num_tokens": 177986497.0, + "step": 1664 + }, + { + "epoch": 3.7936145952109466, + "grad_norm": 3.046875, + "learning_rate": 3.6609098362432425e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8837129175662994, + "num_tokens": 178093286.0, + "step": 1665 + }, + { + "epoch": 3.7958950969213228, + "grad_norm": 4.0, + "learning_rate": 3.659241440892806e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8812308311462402, + "num_tokens": 178200373.0, + "step": 1666 + }, + { + "epoch": 3.798175598631699, + "grad_norm": 4.1875, + "learning_rate": 3.6575723875793085e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8776303678750992, + "num_tokens": 178307338.0, + "step": 1667 + }, + { + "epoch": 3.800456100342075, + "grad_norm": 6.3125, + "learning_rate": 3.655902677250071e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8835581988096237, + "num_tokens": 178414549.0, + "step": 1668 + }, + { + "epoch": 3.802736602052452, + "grad_norm": 3.578125, + "learning_rate": 3.6542323108527896e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8782322406768799, + "num_tokens": 178521662.0, + "step": 1669 + }, + { + "epoch": 3.805017103762828, + "grad_norm": 2.453125, + "learning_rate": 3.652561289335532e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8794067353010178, + "num_tokens": 178627910.0, + "step": 1670 + }, + { + "epoch": 3.807297605473204, + "grad_norm": 3.765625, + "learning_rate": 3.6508896136467376e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8825301080942154, + "num_tokens": 178735254.0, + "step": 1671 + }, + { + "epoch": 3.8095781071835804, + "grad_norm": 3.0, + "learning_rate": 3.649217284735217e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8833526968955994, + "num_tokens": 178842561.0, + "step": 1672 + }, + { + "epoch": 3.8118586088939566, + "grad_norm": 4.59375, + "learning_rate": 3.6475443035501522e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8792631328105927, + "num_tokens": 178949352.0, + "step": 1673 + }, + { + "epoch": 3.8141391106043327, + "grad_norm": 3.90625, + "learning_rate": 3.645870671041095e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8842526823282242, + "num_tokens": 179056304.0, + "step": 1674 + }, + { + "epoch": 3.8164196123147094, + "grad_norm": 4.15625, + "learning_rate": 3.6441963881579668e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8808672279119492, + "num_tokens": 179163897.0, + "step": 1675 + }, + { + "epoch": 3.8187001140250856, + "grad_norm": 4.75, + "learning_rate": 3.642521455851058e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8823509216308594, + "num_tokens": 179270806.0, + "step": 1676 + }, + { + "epoch": 3.8209806157354618, + "grad_norm": 3.3125, + "learning_rate": 3.6408458750710284e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8836124241352081, + "num_tokens": 179378081.0, + "step": 1677 + }, + { + "epoch": 3.823261117445838, + "grad_norm": 4.625, + "learning_rate": 3.639169646768905e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8754040449857712, + "num_tokens": 179485558.0, + "step": 1678 + }, + { + "epoch": 3.8255416191562146, + "grad_norm": 5.21875, + "learning_rate": 3.637492771896082e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8810373991727829, + "num_tokens": 179592323.0, + "step": 1679 + }, + { + "epoch": 3.827822120866591, + "grad_norm": 3.0625, + "learning_rate": 3.6358152514043226e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8800250738859177, + "num_tokens": 179699420.0, + "step": 1680 + }, + { + "epoch": 3.830102622576967, + "grad_norm": 3.90625, + "learning_rate": 3.634137086245754e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8808536231517792, + "num_tokens": 179806666.0, + "step": 1681 + }, + { + "epoch": 3.832383124287343, + "grad_norm": 4.28125, + "learning_rate": 3.6324582773728712e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8750377893447876, + "num_tokens": 179914021.0, + "step": 1682 + }, + { + "epoch": 3.8346636259977194, + "grad_norm": 3.078125, + "learning_rate": 3.6307788257385325e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8790106326341629, + "num_tokens": 180020983.0, + "step": 1683 + }, + { + "epoch": 3.8369441277080956, + "grad_norm": 2.59375, + "learning_rate": 3.6290987322959624e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8865607380867004, + "num_tokens": 180128755.0, + "step": 1684 + }, + { + "epoch": 3.839224629418472, + "grad_norm": 2.375, + "learning_rate": 3.6274179979987507e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.881371021270752, + "num_tokens": 180235525.0, + "step": 1685 + }, + { + "epoch": 3.8415051311288484, + "grad_norm": 2.890625, + "learning_rate": 3.625736623800849e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.878914400935173, + "num_tokens": 180342761.0, + "step": 1686 + }, + { + "epoch": 3.8437856328392246, + "grad_norm": 2.828125, + "learning_rate": 3.624054610656572e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.879774883389473, + "num_tokens": 180450260.0, + "step": 1687 + }, + { + "epoch": 3.846066134549601, + "grad_norm": 3.34375, + "learning_rate": 3.622371959520599e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.881365180015564, + "num_tokens": 180557292.0, + "step": 1688 + }, + { + "epoch": 3.8483466362599774, + "grad_norm": 2.734375, + "learning_rate": 3.6206886713479705e-06, + "loss": 0.6012, + "mean_token_accuracy": 0.8755958080291748, + "num_tokens": 180664350.0, + "step": 1689 + }, + { + "epoch": 3.8506271379703536, + "grad_norm": 2.859375, + "learning_rate": 3.6190047470940875e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8807881772518158, + "num_tokens": 180771157.0, + "step": 1690 + }, + { + "epoch": 3.85290763968073, + "grad_norm": 2.875, + "learning_rate": 3.6173201877147134e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8763711750507355, + "num_tokens": 180877774.0, + "step": 1691 + }, + { + "epoch": 3.855188141391106, + "grad_norm": 3.90625, + "learning_rate": 3.6156349941659717e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8838952630758286, + "num_tokens": 180985055.0, + "step": 1692 + }, + { + "epoch": 3.857468643101482, + "grad_norm": 4.53125, + "learning_rate": 3.613949167404345e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8797314018011093, + "num_tokens": 181091651.0, + "step": 1693 + }, + { + "epoch": 3.8597491448118584, + "grad_norm": 4.3125, + "learning_rate": 3.6122627083866773e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8829943537712097, + "num_tokens": 181198250.0, + "step": 1694 + }, + { + "epoch": 3.862029646522235, + "grad_norm": 3.578125, + "learning_rate": 3.610575618070169e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8832627683877945, + "num_tokens": 181305182.0, + "step": 1695 + }, + { + "epoch": 3.864310148232611, + "grad_norm": 3.609375, + "learning_rate": 3.6088878974123796e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8848606944084167, + "num_tokens": 181412694.0, + "step": 1696 + }, + { + "epoch": 3.8665906499429874, + "grad_norm": 5.125, + "learning_rate": 3.6071995473712284e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8789099603891373, + "num_tokens": 181519464.0, + "step": 1697 + }, + { + "epoch": 3.8688711516533636, + "grad_norm": 3.640625, + "learning_rate": 3.605510568904989e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.8752625435590744, + "num_tokens": 181626561.0, + "step": 1698 + }, + { + "epoch": 3.8711516533637402, + "grad_norm": 4.90625, + "learning_rate": 3.6038209629722936e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8840775638818741, + "num_tokens": 181733816.0, + "step": 1699 + }, + { + "epoch": 3.8734321550741164, + "grad_norm": 2.609375, + "learning_rate": 3.6021307305321295e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8823692500591278, + "num_tokens": 181841269.0, + "step": 1700 + }, + { + "epoch": 3.8757126567844926, + "grad_norm": 4.40625, + "learning_rate": 3.6004398725438406e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8876504898071289, + "num_tokens": 181948467.0, + "step": 1701 + }, + { + "epoch": 3.877993158494869, + "grad_norm": 4.8125, + "learning_rate": 3.5987483899671245e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8821342885494232, + "num_tokens": 182055789.0, + "step": 1702 + }, + { + "epoch": 3.880273660205245, + "grad_norm": 4.5625, + "learning_rate": 3.597056283762034e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8786395937204361, + "num_tokens": 182163625.0, + "step": 1703 + }, + { + "epoch": 3.882554161915621, + "grad_norm": 2.828125, + "learning_rate": 3.5953635548889777e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8824369460344315, + "num_tokens": 182270282.0, + "step": 1704 + }, + { + "epoch": 3.884834663625998, + "grad_norm": 2.796875, + "learning_rate": 3.5936702043087134e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8777669966220856, + "num_tokens": 182376966.0, + "step": 1705 + }, + { + "epoch": 3.887115165336374, + "grad_norm": 2.75, + "learning_rate": 3.5919762329823556e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8837083727121353, + "num_tokens": 182484405.0, + "step": 1706 + }, + { + "epoch": 3.88939566704675, + "grad_norm": 4.125, + "learning_rate": 3.5902816418713694e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8821840286254883, + "num_tokens": 182591485.0, + "step": 1707 + }, + { + "epoch": 3.8916761687571264, + "grad_norm": 8.375, + "learning_rate": 3.5885864319375717e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8777542114257812, + "num_tokens": 182699148.0, + "step": 1708 + }, + { + "epoch": 3.893956670467503, + "grad_norm": 4.15625, + "learning_rate": 3.5868906041431313e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8843749016523361, + "num_tokens": 182805780.0, + "step": 1709 + }, + { + "epoch": 3.8962371721778792, + "grad_norm": 4.125, + "learning_rate": 3.5851941594505674e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8774453997612, + "num_tokens": 182912492.0, + "step": 1710 + }, + { + "epoch": 3.8985176738882554, + "grad_norm": 2.390625, + "learning_rate": 3.5834970988227484e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8813342452049255, + "num_tokens": 183019581.0, + "step": 1711 + }, + { + "epoch": 3.9007981755986316, + "grad_norm": 5.125, + "learning_rate": 3.581799423222895e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8827546834945679, + "num_tokens": 183126811.0, + "step": 1712 + }, + { + "epoch": 3.903078677309008, + "grad_norm": 8.5, + "learning_rate": 3.580101133614573e-06, + "loss": 0.583, + "mean_token_accuracy": 0.879145547747612, + "num_tokens": 183234837.0, + "step": 1713 + }, + { + "epoch": 3.905359179019384, + "grad_norm": 3.578125, + "learning_rate": 3.5784022309617006e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8833577632904053, + "num_tokens": 183342206.0, + "step": 1714 + }, + { + "epoch": 3.9076396807297606, + "grad_norm": 4.0625, + "learning_rate": 3.57670271622854e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8815594464540482, + "num_tokens": 183448841.0, + "step": 1715 + }, + { + "epoch": 3.909920182440137, + "grad_norm": 2.546875, + "learning_rate": 3.5750025903797053e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8809472620487213, + "num_tokens": 183555198.0, + "step": 1716 + }, + { + "epoch": 3.912200684150513, + "grad_norm": 3.34375, + "learning_rate": 3.5733018543801534e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8786370903253555, + "num_tokens": 183661758.0, + "step": 1717 + }, + { + "epoch": 3.9144811858608897, + "grad_norm": 6.25, + "learning_rate": 3.5716005091951906e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8804649114608765, + "num_tokens": 183768385.0, + "step": 1718 + }, + { + "epoch": 3.916761687571266, + "grad_norm": 5.34375, + "learning_rate": 3.569898555790466e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8821226358413696, + "num_tokens": 183874915.0, + "step": 1719 + }, + { + "epoch": 3.919042189281642, + "grad_norm": 3.921875, + "learning_rate": 3.5681959951319766e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.8761325925588608, + "num_tokens": 183982390.0, + "step": 1720 + }, + { + "epoch": 3.9213226909920182, + "grad_norm": 2.796875, + "learning_rate": 3.566492828186063e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8820738047361374, + "num_tokens": 184089244.0, + "step": 1721 + }, + { + "epoch": 3.9236031927023944, + "grad_norm": 3.515625, + "learning_rate": 3.564789055919409e-06, + "loss": 0.5631, + "mean_token_accuracy": 0.8825008720159531, + "num_tokens": 184196515.0, + "step": 1722 + }, + { + "epoch": 3.9258836944127706, + "grad_norm": 2.984375, + "learning_rate": 3.5630846792990435e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8774504512548447, + "num_tokens": 184303445.0, + "step": 1723 + }, + { + "epoch": 3.928164196123147, + "grad_norm": 3.859375, + "learning_rate": 3.5613796992923382e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8825356364250183, + "num_tokens": 184410680.0, + "step": 1724 + }, + { + "epoch": 3.9304446978335235, + "grad_norm": 2.90625, + "learning_rate": 3.559674116867006e-06, + "loss": 0.577, + "mean_token_accuracy": 0.881066232919693, + "num_tokens": 184517883.0, + "step": 1725 + }, + { + "epoch": 3.9327251995438997, + "grad_norm": 2.390625, + "learning_rate": 3.5579679329911025e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.8830063939094543, + "num_tokens": 184625069.0, + "step": 1726 + }, + { + "epoch": 3.935005701254276, + "grad_norm": 3.234375, + "learning_rate": 3.556261148633026e-06, + "loss": 0.576, + "mean_token_accuracy": 0.8830278366804123, + "num_tokens": 184732358.0, + "step": 1727 + }, + { + "epoch": 3.9372862029646525, + "grad_norm": 3.34375, + "learning_rate": 3.5545537647615125e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8799594938755035, + "num_tokens": 184839208.0, + "step": 1728 + }, + { + "epoch": 3.9395667046750287, + "grad_norm": 2.671875, + "learning_rate": 3.552845782345642e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8789944648742676, + "num_tokens": 184945933.0, + "step": 1729 + }, + { + "epoch": 3.941847206385405, + "grad_norm": 2.8125, + "learning_rate": 3.551137202354831e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8809881657361984, + "num_tokens": 185052752.0, + "step": 1730 + }, + { + "epoch": 3.944127708095781, + "grad_norm": 2.84375, + "learning_rate": 3.5494280257588367e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8846791535615921, + "num_tokens": 185159970.0, + "step": 1731 + }, + { + "epoch": 3.9464082098061573, + "grad_norm": 3.515625, + "learning_rate": 3.547718253527755e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8843375146389008, + "num_tokens": 185267250.0, + "step": 1732 + }, + { + "epoch": 3.9486887115165334, + "grad_norm": 2.4375, + "learning_rate": 3.546007886632019e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8792973756790161, + "num_tokens": 185373715.0, + "step": 1733 + }, + { + "epoch": 3.95096921322691, + "grad_norm": 2.796875, + "learning_rate": 3.5442969260424022e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8815480768680573, + "num_tokens": 185480906.0, + "step": 1734 + }, + { + "epoch": 3.9532497149372863, + "grad_norm": 3.109375, + "learning_rate": 3.5425853727300095e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8790358453989029, + "num_tokens": 185588351.0, + "step": 1735 + }, + { + "epoch": 3.9555302166476625, + "grad_norm": 2.8125, + "learning_rate": 3.5408732276662882e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8829907327890396, + "num_tokens": 185695047.0, + "step": 1736 + }, + { + "epoch": 3.9578107183580387, + "grad_norm": 3.59375, + "learning_rate": 3.5391604918230173e-06, + "loss": 0.582, + "mean_token_accuracy": 0.878580242395401, + "num_tokens": 185802784.0, + "step": 1737 + }, + { + "epoch": 3.9600912200684153, + "grad_norm": 2.765625, + "learning_rate": 3.537447166172313e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.879580944776535, + "num_tokens": 185909720.0, + "step": 1738 + }, + { + "epoch": 3.9623717217787915, + "grad_norm": 2.765625, + "learning_rate": 3.5357332516866256e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8843010365962982, + "num_tokens": 186016443.0, + "step": 1739 + }, + { + "epoch": 3.9646522234891677, + "grad_norm": 3.375, + "learning_rate": 3.534018749338741e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8813291490077972, + "num_tokens": 186123915.0, + "step": 1740 + }, + { + "epoch": 3.966932725199544, + "grad_norm": 2.8125, + "learning_rate": 3.532303660101776e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8835762590169907, + "num_tokens": 186230831.0, + "step": 1741 + }, + { + "epoch": 3.96921322690992, + "grad_norm": 4.15625, + "learning_rate": 3.530587984949183e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8778104931116104, + "num_tokens": 186337578.0, + "step": 1742 + }, + { + "epoch": 3.9714937286202963, + "grad_norm": 3.125, + "learning_rate": 3.5288717248547453e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8861571699380875, + "num_tokens": 186445605.0, + "step": 1743 + }, + { + "epoch": 3.973774230330673, + "grad_norm": 3.65625, + "learning_rate": 3.5271548807925803e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8777986913919449, + "num_tokens": 186552754.0, + "step": 1744 + }, + { + "epoch": 3.976054732041049, + "grad_norm": 2.90625, + "learning_rate": 3.525437453737136e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8793213814496994, + "num_tokens": 186659906.0, + "step": 1745 + }, + { + "epoch": 3.9783352337514253, + "grad_norm": 3.15625, + "learning_rate": 3.5237194446631883e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8833697587251663, + "num_tokens": 186766193.0, + "step": 1746 + }, + { + "epoch": 3.9806157354618015, + "grad_norm": 2.59375, + "learning_rate": 3.522000854545849e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8789832890033722, + "num_tokens": 186872982.0, + "step": 1747 + }, + { + "epoch": 3.982896237172178, + "grad_norm": 2.78125, + "learning_rate": 3.520281684360554e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8788421899080276, + "num_tokens": 186979970.0, + "step": 1748 + }, + { + "epoch": 3.9851767388825543, + "grad_norm": 2.875, + "learning_rate": 3.5185619350830725e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8799102157354355, + "num_tokens": 187086646.0, + "step": 1749 + }, + { + "epoch": 3.9874572405929305, + "grad_norm": 2.296875, + "learning_rate": 3.516841607689501e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8887099474668503, + "num_tokens": 187194165.0, + "step": 1750 + }, + { + "epoch": 3.9897377423033067, + "grad_norm": 3.078125, + "learning_rate": 3.515120703156264e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8788872063159943, + "num_tokens": 187300450.0, + "step": 1751 + }, + { + "epoch": 3.992018244013683, + "grad_norm": 2.84375, + "learning_rate": 3.5133992224601126e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8765320926904678, + "num_tokens": 187407316.0, + "step": 1752 + }, + { + "epoch": 3.994298745724059, + "grad_norm": 4.46875, + "learning_rate": 3.511677166578128e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8845993727445602, + "num_tokens": 187515138.0, + "step": 1753 + }, + { + "epoch": 3.9965792474344357, + "grad_norm": 2.328125, + "learning_rate": 3.509954536487714e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8815711736679077, + "num_tokens": 187622052.0, + "step": 1754 + }, + { + "epoch": 3.998859749144812, + "grad_norm": 4.40625, + "learning_rate": 3.5082313331666035e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8849412500858307, + "num_tokens": 187729710.0, + "step": 1755 + }, + { + "epoch": 4.0, + "grad_norm": 4.90625, + "learning_rate": 3.506507557592853e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.879196435213089, + "num_tokens": 187768928.0, + "step": 1756 + }, + { + "epoch": 4.002280501710376, + "grad_norm": 3.0, + "learning_rate": 3.5047832107448437e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.8787702172994614, + "num_tokens": 187875429.0, + "step": 1757 + }, + { + "epoch": 4.004561003420752, + "grad_norm": 2.703125, + "learning_rate": 3.503058293601283e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8796392232179642, + "num_tokens": 187982859.0, + "step": 1758 + }, + { + "epoch": 4.006841505131129, + "grad_norm": 2.921875, + "learning_rate": 3.5013328071411995e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8837246298789978, + "num_tokens": 188090247.0, + "step": 1759 + }, + { + "epoch": 4.009122006841505, + "grad_norm": 2.640625, + "learning_rate": 3.499606752343945e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8794354498386383, + "num_tokens": 188198071.0, + "step": 1760 + }, + { + "epoch": 4.009122006841505, + "eval_loss": 0.5914322733879089, + "eval_mean_token_accuracy": 0.8791430533612182, + "eval_num_tokens": 188198071.0, + "eval_runtime": 58.7059, + "eval_samples_per_second": 142.831, + "eval_steps_per_second": 4.48, + "step": 1760 + }, + { + "epoch": 4.011402508551882, + "grad_norm": 2.765625, + "learning_rate": 3.4978801301891972e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8818523436784744, + "num_tokens": 188305205.0, + "step": 1761 + }, + { + "epoch": 4.013683010262258, + "grad_norm": 3.625, + "learning_rate": 3.496152941656952e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8844558596611023, + "num_tokens": 188411935.0, + "step": 1762 + }, + { + "epoch": 4.015963511972634, + "grad_norm": 3.453125, + "learning_rate": 3.494425187727528e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8814276456832886, + "num_tokens": 188519090.0, + "step": 1763 + }, + { + "epoch": 4.01824401368301, + "grad_norm": 3.609375, + "learning_rate": 3.4926968693815667e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8852830976247787, + "num_tokens": 188626017.0, + "step": 1764 + }, + { + "epoch": 4.020524515393387, + "grad_norm": 2.984375, + "learning_rate": 3.4909679876000256e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8862181156873703, + "num_tokens": 188733308.0, + "step": 1765 + }, + { + "epoch": 4.022805017103763, + "grad_norm": 4.4375, + "learning_rate": 3.4892385433641875e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8816091120243073, + "num_tokens": 188840326.0, + "step": 1766 + }, + { + "epoch": 4.025085518814139, + "grad_norm": 2.921875, + "learning_rate": 3.4875085376556493e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.881555825471878, + "num_tokens": 188947739.0, + "step": 1767 + }, + { + "epoch": 4.027366020524515, + "grad_norm": 3.109375, + "learning_rate": 3.4857779714563305e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8749964982271194, + "num_tokens": 189054532.0, + "step": 1768 + }, + { + "epoch": 4.029646522234891, + "grad_norm": 2.53125, + "learning_rate": 3.4840468457484654e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8818257004022598, + "num_tokens": 189161911.0, + "step": 1769 + }, + { + "epoch": 4.031927023945268, + "grad_norm": 2.515625, + "learning_rate": 3.4823151615146093e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8840532153844833, + "num_tokens": 189269010.0, + "step": 1770 + }, + { + "epoch": 4.034207525655645, + "grad_norm": 3.484375, + "learning_rate": 3.480582919737631e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8821255415678024, + "num_tokens": 189375921.0, + "step": 1771 + }, + { + "epoch": 4.036488027366021, + "grad_norm": 4.875, + "learning_rate": 3.478850121400719e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8794375658035278, + "num_tokens": 189483390.0, + "step": 1772 + }, + { + "epoch": 4.038768529076397, + "grad_norm": 3.265625, + "learning_rate": 3.477116767487375e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8811006546020508, + "num_tokens": 189590557.0, + "step": 1773 + }, + { + "epoch": 4.041049030786773, + "grad_norm": 2.734375, + "learning_rate": 3.475382858981418e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8820205479860306, + "num_tokens": 189697618.0, + "step": 1774 + }, + { + "epoch": 4.043329532497149, + "grad_norm": 7.40625, + "learning_rate": 3.473648396866981e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8759682923555374, + "num_tokens": 189804472.0, + "step": 1775 + }, + { + "epoch": 4.045610034207526, + "grad_norm": 4.21875, + "learning_rate": 3.4719133821285108e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8793429881334305, + "num_tokens": 189911753.0, + "step": 1776 + }, + { + "epoch": 4.047890535917902, + "grad_norm": 4.78125, + "learning_rate": 3.470177815750769e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8850299268960953, + "num_tokens": 190019669.0, + "step": 1777 + }, + { + "epoch": 4.050171037628278, + "grad_norm": 3.734375, + "learning_rate": 3.4684416987188273e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8823755383491516, + "num_tokens": 190127195.0, + "step": 1778 + }, + { + "epoch": 4.052451539338654, + "grad_norm": 3.234375, + "learning_rate": 3.4667050320180755e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8784758597612381, + "num_tokens": 190234158.0, + "step": 1779 + }, + { + "epoch": 4.05473204104903, + "grad_norm": 2.59375, + "learning_rate": 3.4649678166342104e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8827002048492432, + "num_tokens": 190341000.0, + "step": 1780 + }, + { + "epoch": 4.0570125427594075, + "grad_norm": 2.671875, + "learning_rate": 3.4632300535532415e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8756804317235947, + "num_tokens": 190448392.0, + "step": 1781 + }, + { + "epoch": 4.059293044469784, + "grad_norm": 4.4375, + "learning_rate": 3.46149174376149e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8806837797164917, + "num_tokens": 190555060.0, + "step": 1782 + }, + { + "epoch": 4.06157354618016, + "grad_norm": 5.09375, + "learning_rate": 3.459752888245587e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8827811032533646, + "num_tokens": 190662364.0, + "step": 1783 + }, + { + "epoch": 4.063854047890536, + "grad_norm": 2.546875, + "learning_rate": 3.4580134879924732e-06, + "loss": 0.584, + "mean_token_accuracy": 0.877520278096199, + "num_tokens": 190769329.0, + "step": 1784 + }, + { + "epoch": 4.066134549600912, + "grad_norm": 3.0, + "learning_rate": 3.4562735439894e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.88214111328125, + "num_tokens": 190876871.0, + "step": 1785 + }, + { + "epoch": 4.068415051311288, + "grad_norm": 3.890625, + "learning_rate": 3.4545330572239234e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8759456723928452, + "num_tokens": 190983692.0, + "step": 1786 + }, + { + "epoch": 4.070695553021665, + "grad_norm": 4.65625, + "learning_rate": 3.452792028683912e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8788794428110123, + "num_tokens": 191090686.0, + "step": 1787 + }, + { + "epoch": 4.072976054732041, + "grad_norm": 2.71875, + "learning_rate": 3.4510504593575396e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8841027617454529, + "num_tokens": 191197679.0, + "step": 1788 + }, + { + "epoch": 4.075256556442417, + "grad_norm": 3.96875, + "learning_rate": 3.449308350233287e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8803805410861969, + "num_tokens": 191305286.0, + "step": 1789 + }, + { + "epoch": 4.077537058152793, + "grad_norm": 2.765625, + "learning_rate": 3.447565702299942e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8774025738239288, + "num_tokens": 191412336.0, + "step": 1790 + }, + { + "epoch": 4.07981755986317, + "grad_norm": 2.859375, + "learning_rate": 3.445822516546598e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.881226509809494, + "num_tokens": 191519740.0, + "step": 1791 + }, + { + "epoch": 4.0820980615735465, + "grad_norm": 2.5625, + "learning_rate": 3.444078793962653e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8817672431468964, + "num_tokens": 191626597.0, + "step": 1792 + }, + { + "epoch": 4.084378563283923, + "grad_norm": 2.328125, + "learning_rate": 3.4423345355378114e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8822430968284607, + "num_tokens": 191733873.0, + "step": 1793 + }, + { + "epoch": 4.086659064994299, + "grad_norm": 4.625, + "learning_rate": 3.440589742262079e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8803035467863083, + "num_tokens": 191840789.0, + "step": 1794 + }, + { + "epoch": 4.088939566704675, + "grad_norm": 5.25, + "learning_rate": 3.438844415125768e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8835125714540482, + "num_tokens": 191947897.0, + "step": 1795 + }, + { + "epoch": 4.091220068415051, + "grad_norm": 2.4375, + "learning_rate": 3.437098555119493e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8822659403085709, + "num_tokens": 192055072.0, + "step": 1796 + }, + { + "epoch": 4.0935005701254275, + "grad_norm": 2.890625, + "learning_rate": 3.4353521632341686e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8791303038597107, + "num_tokens": 192162000.0, + "step": 1797 + }, + { + "epoch": 4.095781071835804, + "grad_norm": 7.75, + "learning_rate": 3.4336052404610138e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8811719119548798, + "num_tokens": 192269139.0, + "step": 1798 + }, + { + "epoch": 4.09806157354618, + "grad_norm": 8.6875, + "learning_rate": 3.431857787791549e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8792611956596375, + "num_tokens": 192376137.0, + "step": 1799 + }, + { + "epoch": 4.100342075256556, + "grad_norm": 6.96875, + "learning_rate": 3.4301098062175936e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8786915689706802, + "num_tokens": 192483573.0, + "step": 1800 + }, + { + "epoch": 4.102622576966933, + "grad_norm": 3.4375, + "learning_rate": 3.4283612967312692e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8749962151050568, + "num_tokens": 192590098.0, + "step": 1801 + }, + { + "epoch": 4.104903078677309, + "grad_norm": 3.21875, + "learning_rate": 3.426612260324996e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8811412006616592, + "num_tokens": 192696798.0, + "step": 1802 + }, + { + "epoch": 4.1071835803876855, + "grad_norm": 4.84375, + "learning_rate": 3.424862697991491e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8869053721427917, + "num_tokens": 192804017.0, + "step": 1803 + }, + { + "epoch": 4.109464082098062, + "grad_norm": 4.21875, + "learning_rate": 3.4231126107237754e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8787698745727539, + "num_tokens": 192912495.0, + "step": 1804 + }, + { + "epoch": 4.111744583808438, + "grad_norm": 3.1875, + "learning_rate": 3.4213619995151628e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.8815167546272278, + "num_tokens": 193019325.0, + "step": 1805 + }, + { + "epoch": 4.114025085518814, + "grad_norm": 3.15625, + "learning_rate": 3.4196108653592662e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.8847940266132355, + "num_tokens": 193126418.0, + "step": 1806 + }, + { + "epoch": 4.11630558722919, + "grad_norm": 2.453125, + "learning_rate": 3.417859209249997e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8841525167226791, + "num_tokens": 193233113.0, + "step": 1807 + }, + { + "epoch": 4.1185860889395665, + "grad_norm": 3.515625, + "learning_rate": 3.4161070321815605e-06, + "loss": 0.5985, + "mean_token_accuracy": 0.8764221370220184, + "num_tokens": 193339988.0, + "step": 1808 + }, + { + "epoch": 4.120866590649943, + "grad_norm": 3.859375, + "learning_rate": 3.4143543351484585e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8799712508916855, + "num_tokens": 193446945.0, + "step": 1809 + }, + { + "epoch": 4.123147092360319, + "grad_norm": 3.8125, + "learning_rate": 3.4126011191454877e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8829108774662018, + "num_tokens": 193553550.0, + "step": 1810 + }, + { + "epoch": 4.125427594070696, + "grad_norm": 5.15625, + "learning_rate": 3.4108473851677408e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8795969039201736, + "num_tokens": 193660308.0, + "step": 1811 + }, + { + "epoch": 4.127708095781072, + "grad_norm": 3.375, + "learning_rate": 3.4090931342106024e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8809874951839447, + "num_tokens": 193768000.0, + "step": 1812 + }, + { + "epoch": 4.129988597491448, + "grad_norm": 3.078125, + "learning_rate": 3.4073383672697524e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8771644234657288, + "num_tokens": 193875084.0, + "step": 1813 + }, + { + "epoch": 4.1322690992018245, + "grad_norm": 4.8125, + "learning_rate": 3.4055830853411616e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8745006024837494, + "num_tokens": 193981792.0, + "step": 1814 + }, + { + "epoch": 4.134549600912201, + "grad_norm": 2.734375, + "learning_rate": 3.4038272894210945e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8784460872411728, + "num_tokens": 194088451.0, + "step": 1815 + }, + { + "epoch": 4.136830102622577, + "grad_norm": 3.21875, + "learning_rate": 3.4020709805061066e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8836447149515152, + "num_tokens": 194195882.0, + "step": 1816 + }, + { + "epoch": 4.139110604332953, + "grad_norm": 2.703125, + "learning_rate": 3.4003141595930456e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8817628026008606, + "num_tokens": 194302902.0, + "step": 1817 + }, + { + "epoch": 4.141391106043329, + "grad_norm": 2.78125, + "learning_rate": 3.3985568276790487e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8798153549432755, + "num_tokens": 194410857.0, + "step": 1818 + }, + { + "epoch": 4.1436716077537055, + "grad_norm": 3.734375, + "learning_rate": 3.3967989857615434e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8804911673069, + "num_tokens": 194517991.0, + "step": 1819 + }, + { + "epoch": 4.145952109464082, + "grad_norm": 4.09375, + "learning_rate": 3.3950406348382483e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8826321363449097, + "num_tokens": 194624664.0, + "step": 1820 + }, + { + "epoch": 4.148232611174459, + "grad_norm": 4.375, + "learning_rate": 3.3932817759071666e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.8814938217401505, + "num_tokens": 194731631.0, + "step": 1821 + }, + { + "epoch": 4.150513112884835, + "grad_norm": 2.65625, + "learning_rate": 3.3915224099665962e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8795495629310608, + "num_tokens": 194838909.0, + "step": 1822 + }, + { + "epoch": 4.152793614595211, + "grad_norm": 4.15625, + "learning_rate": 3.389762538015116e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.8770654946565628, + "num_tokens": 194945773.0, + "step": 1823 + }, + { + "epoch": 4.155074116305587, + "grad_norm": 2.65625, + "learning_rate": 3.388002161051598e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8804232776165009, + "num_tokens": 195052653.0, + "step": 1824 + }, + { + "epoch": 4.1573546180159635, + "grad_norm": 4.625, + "learning_rate": 3.3862412800751963e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8783848881721497, + "num_tokens": 195159206.0, + "step": 1825 + }, + { + "epoch": 4.15963511972634, + "grad_norm": 5.15625, + "learning_rate": 3.3844798960853533e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.879721149802208, + "num_tokens": 195266220.0, + "step": 1826 + }, + { + "epoch": 4.161915621436716, + "grad_norm": 3.734375, + "learning_rate": 3.382718010081797e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8808034509420395, + "num_tokens": 195373432.0, + "step": 1827 + }, + { + "epoch": 4.164196123147092, + "grad_norm": 3.40625, + "learning_rate": 3.38095562306454e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8810047507286072, + "num_tokens": 195480280.0, + "step": 1828 + }, + { + "epoch": 4.166476624857468, + "grad_norm": 2.28125, + "learning_rate": 3.3791927360338785e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8826696872711182, + "num_tokens": 195587648.0, + "step": 1829 + }, + { + "epoch": 4.168757126567845, + "grad_norm": 3.203125, + "learning_rate": 3.3774293499903934e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8804333359003067, + "num_tokens": 195694201.0, + "step": 1830 + }, + { + "epoch": 4.1710376282782216, + "grad_norm": 4.53125, + "learning_rate": 3.3756654659349487e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.879314973950386, + "num_tokens": 195800840.0, + "step": 1831 + }, + { + "epoch": 4.173318129988598, + "grad_norm": 3.859375, + "learning_rate": 3.373901084868691e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8824407607316971, + "num_tokens": 195908050.0, + "step": 1832 + }, + { + "epoch": 4.175598631698974, + "grad_norm": 4.4375, + "learning_rate": 3.372136207793049e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8842916339635849, + "num_tokens": 196014977.0, + "step": 1833 + }, + { + "epoch": 4.17787913340935, + "grad_norm": 2.671875, + "learning_rate": 3.3703708357097333e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8835365027189255, + "num_tokens": 196122033.0, + "step": 1834 + }, + { + "epoch": 4.180159635119726, + "grad_norm": 5.25, + "learning_rate": 3.3686049696207336e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8810452073812485, + "num_tokens": 196228948.0, + "step": 1835 + }, + { + "epoch": 4.1824401368301025, + "grad_norm": 4.90625, + "learning_rate": 3.3668386105283226e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.882371187210083, + "num_tokens": 196335775.0, + "step": 1836 + }, + { + "epoch": 4.184720638540479, + "grad_norm": 4.03125, + "learning_rate": 3.365071759435051e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8814999759197235, + "num_tokens": 196442967.0, + "step": 1837 + }, + { + "epoch": 4.187001140250855, + "grad_norm": 2.828125, + "learning_rate": 3.363304417343749e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8800509870052338, + "num_tokens": 196550345.0, + "step": 1838 + }, + { + "epoch": 4.189281641961231, + "grad_norm": 2.625, + "learning_rate": 3.3615365852575276e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8801786154508591, + "num_tokens": 196658198.0, + "step": 1839 + }, + { + "epoch": 4.191562143671608, + "grad_norm": 5.03125, + "learning_rate": 3.359768264179772e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8778839707374573, + "num_tokens": 196765684.0, + "step": 1840 + }, + { + "epoch": 4.193842645381984, + "grad_norm": 2.921875, + "learning_rate": 3.357999455114148e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8802204430103302, + "num_tokens": 196872724.0, + "step": 1841 + }, + { + "epoch": 4.196123147092361, + "grad_norm": 2.421875, + "learning_rate": 3.356230159064599e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8841566443443298, + "num_tokens": 196979553.0, + "step": 1842 + }, + { + "epoch": 4.198403648802737, + "grad_norm": 2.59375, + "learning_rate": 3.3544603770353407e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8825344890356064, + "num_tokens": 197086673.0, + "step": 1843 + }, + { + "epoch": 4.200684150513113, + "grad_norm": 2.921875, + "learning_rate": 3.352690110030869e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8824966847896576, + "num_tokens": 197193889.0, + "step": 1844 + }, + { + "epoch": 4.202964652223489, + "grad_norm": 2.96875, + "learning_rate": 3.350919359055953e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8796708136796951, + "num_tokens": 197301220.0, + "step": 1845 + }, + { + "epoch": 4.205245153933865, + "grad_norm": 3.0, + "learning_rate": 3.3491481251156355e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8849449008703232, + "num_tokens": 197408502.0, + "step": 1846 + }, + { + "epoch": 4.2075256556442415, + "grad_norm": 2.71875, + "learning_rate": 3.347376409215236e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8793286383152008, + "num_tokens": 197515859.0, + "step": 1847 + }, + { + "epoch": 4.209806157354618, + "grad_norm": 2.546875, + "learning_rate": 3.345604212360346e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8793947696685791, + "num_tokens": 197622609.0, + "step": 1848 + }, + { + "epoch": 4.212086659064994, + "grad_norm": 4.5, + "learning_rate": 3.3438315355568295e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8782702833414078, + "num_tokens": 197729199.0, + "step": 1849 + }, + { + "epoch": 4.214367160775371, + "grad_norm": 4.71875, + "learning_rate": 3.3420583798108253e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8802850246429443, + "num_tokens": 197835966.0, + "step": 1850 + }, + { + "epoch": 4.216647662485747, + "grad_norm": 3.84375, + "learning_rate": 3.34028474612874e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8822648972272873, + "num_tokens": 197942829.0, + "step": 1851 + }, + { + "epoch": 4.218928164196123, + "grad_norm": 5.375, + "learning_rate": 3.338510635517256e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8766555935144424, + "num_tokens": 198050288.0, + "step": 1852 + }, + { + "epoch": 4.2212086659065, + "grad_norm": 6.46875, + "learning_rate": 3.3367360489833236e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8814980983734131, + "num_tokens": 198158005.0, + "step": 1853 + }, + { + "epoch": 4.223489167616876, + "grad_norm": 8.0625, + "learning_rate": 3.3349609875341626e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.882731556892395, + "num_tokens": 198265125.0, + "step": 1854 + }, + { + "epoch": 4.225769669327252, + "grad_norm": 3.390625, + "learning_rate": 3.3331854521772656e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8883755952119827, + "num_tokens": 198371867.0, + "step": 1855 + }, + { + "epoch": 4.228050171037628, + "grad_norm": 2.375, + "learning_rate": 3.3314094439203903e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8845670074224472, + "num_tokens": 198478602.0, + "step": 1856 + }, + { + "epoch": 4.230330672748004, + "grad_norm": 4.1875, + "learning_rate": 3.3296329637715662e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8805161267518997, + "num_tokens": 198585016.0, + "step": 1857 + }, + { + "epoch": 4.2326111744583805, + "grad_norm": 3.46875, + "learning_rate": 3.3278560127390892e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8802484422922134, + "num_tokens": 198692273.0, + "step": 1858 + }, + { + "epoch": 4.234891676168757, + "grad_norm": 4.6875, + "learning_rate": 3.32607859183152e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8806602954864502, + "num_tokens": 198798563.0, + "step": 1859 + }, + { + "epoch": 4.237172177879134, + "grad_norm": 2.34375, + "learning_rate": 3.3243007020576917e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8770420551300049, + "num_tokens": 198905412.0, + "step": 1860 + }, + { + "epoch": 4.23945267958951, + "grad_norm": 6.125, + "learning_rate": 3.322522344426698e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.883165031671524, + "num_tokens": 199012136.0, + "step": 1861 + }, + { + "epoch": 4.241733181299886, + "grad_norm": 2.734375, + "learning_rate": 3.320743519947901e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.880427822470665, + "num_tokens": 199119481.0, + "step": 1862 + }, + { + "epoch": 4.244013683010262, + "grad_norm": 6.875, + "learning_rate": 3.318964229630927e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.8777157664299011, + "num_tokens": 199226442.0, + "step": 1863 + }, + { + "epoch": 4.246294184720639, + "grad_norm": 3.453125, + "learning_rate": 3.3171844744856675e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8849682509899139, + "num_tokens": 199333370.0, + "step": 1864 + }, + { + "epoch": 4.248574686431015, + "grad_norm": 3.953125, + "learning_rate": 3.3154042555222758e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8795311897993088, + "num_tokens": 199440819.0, + "step": 1865 + }, + { + "epoch": 4.250855188141391, + "grad_norm": 4.8125, + "learning_rate": 3.3136235737511715e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8744902908802032, + "num_tokens": 199547464.0, + "step": 1866 + }, + { + "epoch": 4.253135689851767, + "grad_norm": 2.90625, + "learning_rate": 3.3118424301830343e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8773903250694275, + "num_tokens": 199654391.0, + "step": 1867 + }, + { + "epoch": 4.255416191562143, + "grad_norm": 3.265625, + "learning_rate": 3.310060825828807e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8800706118345261, + "num_tokens": 199761255.0, + "step": 1868 + }, + { + "epoch": 4.2576966932725195, + "grad_norm": 2.828125, + "learning_rate": 3.3082787616996938e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8814914971590042, + "num_tokens": 199868966.0, + "step": 1869 + }, + { + "epoch": 4.259977194982897, + "grad_norm": 2.453125, + "learning_rate": 3.3064962388071586e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8803159892559052, + "num_tokens": 199975974.0, + "step": 1870 + }, + { + "epoch": 4.262257696693273, + "grad_norm": 2.671875, + "learning_rate": 3.3047132581629297e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.8839969784021378, + "num_tokens": 200083683.0, + "step": 1871 + }, + { + "epoch": 4.264538198403649, + "grad_norm": 2.765625, + "learning_rate": 3.3029298207789907e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8819585740566254, + "num_tokens": 200191486.0, + "step": 1872 + }, + { + "epoch": 4.266818700114025, + "grad_norm": 6.03125, + "learning_rate": 3.301145927667586e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8768231570720673, + "num_tokens": 200298545.0, + "step": 1873 + }, + { + "epoch": 4.269099201824401, + "grad_norm": 2.6875, + "learning_rate": 3.2993615798412204e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8820153176784515, + "num_tokens": 200405570.0, + "step": 1874 + }, + { + "epoch": 4.271379703534778, + "grad_norm": 4.9375, + "learning_rate": 3.297576778312654e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8819970637559891, + "num_tokens": 200512849.0, + "step": 1875 + }, + { + "epoch": 4.273660205245154, + "grad_norm": 3.578125, + "learning_rate": 3.295791524094906e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.878609761595726, + "num_tokens": 200620132.0, + "step": 1876 + }, + { + "epoch": 4.27594070695553, + "grad_norm": 4.8125, + "learning_rate": 3.294005818201252e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8842368423938751, + "num_tokens": 200726719.0, + "step": 1877 + }, + { + "epoch": 4.278221208665906, + "grad_norm": 2.5625, + "learning_rate": 3.2922196616452253e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8771747201681137, + "num_tokens": 200833547.0, + "step": 1878 + }, + { + "epoch": 4.280501710376283, + "grad_norm": 2.515625, + "learning_rate": 3.2904330554406126e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.880841001868248, + "num_tokens": 200941003.0, + "step": 1879 + }, + { + "epoch": 4.282782212086659, + "grad_norm": 2.71875, + "learning_rate": 3.288646000601457e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8747004866600037, + "num_tokens": 201048114.0, + "step": 1880 + }, + { + "epoch": 4.285062713797036, + "grad_norm": 5.0625, + "learning_rate": 3.286858498142057e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8779664039611816, + "num_tokens": 201155149.0, + "step": 1881 + }, + { + "epoch": 4.287343215507412, + "grad_norm": 3.65625, + "learning_rate": 3.285070549076965e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.877086952328682, + "num_tokens": 201262291.0, + "step": 1882 + }, + { + "epoch": 4.289623717217788, + "grad_norm": 3.5625, + "learning_rate": 3.283282154420985e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8825719207525253, + "num_tokens": 201369699.0, + "step": 1883 + }, + { + "epoch": 4.291904218928164, + "grad_norm": 4.0625, + "learning_rate": 3.2814933151891766e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.881410762667656, + "num_tokens": 201476718.0, + "step": 1884 + }, + { + "epoch": 4.29418472063854, + "grad_norm": 2.859375, + "learning_rate": 3.2797040323968493e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8789144903421402, + "num_tokens": 201583789.0, + "step": 1885 + }, + { + "epoch": 4.296465222348917, + "grad_norm": 3.515625, + "learning_rate": 3.277914307059566e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.886613667011261, + "num_tokens": 201690953.0, + "step": 1886 + }, + { + "epoch": 4.298745724059293, + "grad_norm": 3.28125, + "learning_rate": 3.276124140193141e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.882378563284874, + "num_tokens": 201798723.0, + "step": 1887 + }, + { + "epoch": 4.301026225769669, + "grad_norm": 2.71875, + "learning_rate": 3.274333532813637e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.8767693191766739, + "num_tokens": 201905583.0, + "step": 1888 + }, + { + "epoch": 4.303306727480045, + "grad_norm": 3.921875, + "learning_rate": 3.272542485937369e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.89011649787426, + "num_tokens": 202012658.0, + "step": 1889 + }, + { + "epoch": 4.305587229190422, + "grad_norm": 2.5, + "learning_rate": 3.2707510005809005e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8810762017965317, + "num_tokens": 202120313.0, + "step": 1890 + }, + { + "epoch": 4.307867730900798, + "grad_norm": 3.1875, + "learning_rate": 3.2689590777610443e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8805640786886215, + "num_tokens": 202227525.0, + "step": 1891 + }, + { + "epoch": 4.310148232611175, + "grad_norm": 6.125, + "learning_rate": 3.267166718494861e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8817348927259445, + "num_tokens": 202334247.0, + "step": 1892 + }, + { + "epoch": 4.312428734321551, + "grad_norm": 4.03125, + "learning_rate": 3.265373923799658e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8808832913637161, + "num_tokens": 202441565.0, + "step": 1893 + }, + { + "epoch": 4.314709236031927, + "grad_norm": 2.71875, + "learning_rate": 3.263580694692992e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.882627323269844, + "num_tokens": 202549008.0, + "step": 1894 + }, + { + "epoch": 4.316989737742303, + "grad_norm": 3.59375, + "learning_rate": 3.261787032192666e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8810908943414688, + "num_tokens": 202655622.0, + "step": 1895 + }, + { + "epoch": 4.319270239452679, + "grad_norm": 4.625, + "learning_rate": 3.259992937316727e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.882026731967926, + "num_tokens": 202761692.0, + "step": 1896 + }, + { + "epoch": 4.321550741163056, + "grad_norm": 5.1875, + "learning_rate": 3.258198411083469e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8775666505098343, + "num_tokens": 202868233.0, + "step": 1897 + }, + { + "epoch": 4.323831242873432, + "grad_norm": 8.4375, + "learning_rate": 3.2564034545114308e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.8776109367609024, + "num_tokens": 202975704.0, + "step": 1898 + }, + { + "epoch": 4.326111744583809, + "grad_norm": 2.59375, + "learning_rate": 3.2546080686193947e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8826450109481812, + "num_tokens": 203082943.0, + "step": 1899 + }, + { + "epoch": 4.328392246294185, + "grad_norm": 2.703125, + "learning_rate": 3.2528122544263873e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8836265951395035, + "num_tokens": 203189879.0, + "step": 1900 + }, + { + "epoch": 4.330672748004561, + "grad_norm": 3.515625, + "learning_rate": 3.251016012951678e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8853756338357925, + "num_tokens": 203296924.0, + "step": 1901 + }, + { + "epoch": 4.3329532497149374, + "grad_norm": 8.4375, + "learning_rate": 3.2492193452147774e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8769858479499817, + "num_tokens": 203403322.0, + "step": 1902 + }, + { + "epoch": 4.335233751425314, + "grad_norm": 5.5, + "learning_rate": 3.247422252235442e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8848748654127121, + "num_tokens": 203510621.0, + "step": 1903 + }, + { + "epoch": 4.33751425313569, + "grad_norm": 2.421875, + "learning_rate": 3.245624735033665e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8844299167394638, + "num_tokens": 203617576.0, + "step": 1904 + }, + { + "epoch": 4.339794754846066, + "grad_norm": 4.65625, + "learning_rate": 3.2438267946296836e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8842675089836121, + "num_tokens": 203724726.0, + "step": 1905 + }, + { + "epoch": 4.342075256556442, + "grad_norm": 3.109375, + "learning_rate": 3.242028432043974e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8774498254060745, + "num_tokens": 203832260.0, + "step": 1906 + }, + { + "epoch": 4.344355758266818, + "grad_norm": 2.953125, + "learning_rate": 3.2402296482972513e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8836204558610916, + "num_tokens": 203940076.0, + "step": 1907 + }, + { + "epoch": 4.346636259977195, + "grad_norm": 7.75, + "learning_rate": 3.238430444410471e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8820386677980423, + "num_tokens": 204047654.0, + "step": 1908 + }, + { + "epoch": 4.348916761687571, + "grad_norm": 5.09375, + "learning_rate": 3.2366308214048262e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8806447833776474, + "num_tokens": 204154623.0, + "step": 1909 + }, + { + "epoch": 4.351197263397948, + "grad_norm": 5.03125, + "learning_rate": 3.2348307803017493e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.886213019490242, + "num_tokens": 204261536.0, + "step": 1910 + }, + { + "epoch": 4.353477765108324, + "grad_norm": 4.6875, + "learning_rate": 3.2330303221229078e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8766489624977112, + "num_tokens": 204368307.0, + "step": 1911 + }, + { + "epoch": 4.3557582668187, + "grad_norm": 2.6875, + "learning_rate": 3.231229447890206e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8825319856405258, + "num_tokens": 204475273.0, + "step": 1912 + }, + { + "epoch": 4.3580387685290765, + "grad_norm": 4.21875, + "learning_rate": 3.229428158625787e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8858949691057205, + "num_tokens": 204582592.0, + "step": 1913 + }, + { + "epoch": 4.360319270239453, + "grad_norm": 2.953125, + "learning_rate": 3.2276264553520275e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8835407346487045, + "num_tokens": 204689248.0, + "step": 1914 + }, + { + "epoch": 4.362599771949829, + "grad_norm": 3.09375, + "learning_rate": 3.2258243390915397e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8794312477111816, + "num_tokens": 204796588.0, + "step": 1915 + }, + { + "epoch": 4.364880273660205, + "grad_norm": 3.546875, + "learning_rate": 3.2240218108671683e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8816234618425369, + "num_tokens": 204903575.0, + "step": 1916 + }, + { + "epoch": 4.367160775370581, + "grad_norm": 4.625, + "learning_rate": 3.2222188717019965e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8821093142032623, + "num_tokens": 205010359.0, + "step": 1917 + }, + { + "epoch": 4.369441277080957, + "grad_norm": 3.53125, + "learning_rate": 3.220415522619335e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8760864734649658, + "num_tokens": 205117682.0, + "step": 1918 + }, + { + "epoch": 4.3717217787913345, + "grad_norm": 2.34375, + "learning_rate": 3.218611764642732e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8813271969556808, + "num_tokens": 205224758.0, + "step": 1919 + }, + { + "epoch": 4.374002280501711, + "grad_norm": 3.375, + "learning_rate": 3.2168075987959633e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.8754995763301849, + "num_tokens": 205331474.0, + "step": 1920 + }, + { + "epoch": 4.376282782212087, + "grad_norm": 4.0, + "learning_rate": 3.2150030261030414e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8807307332754135, + "num_tokens": 205438379.0, + "step": 1921 + }, + { + "epoch": 4.378563283922463, + "grad_norm": 2.921875, + "learning_rate": 3.2131980475882053e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8842781186103821, + "num_tokens": 205545207.0, + "step": 1922 + }, + { + "epoch": 4.380843785632839, + "grad_norm": 3.328125, + "learning_rate": 3.2113926642759256e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8773124665021896, + "num_tokens": 205651526.0, + "step": 1923 + }, + { + "epoch": 4.3831242873432155, + "grad_norm": 4.15625, + "learning_rate": 3.2095868771909037e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8833547234535217, + "num_tokens": 205758217.0, + "step": 1924 + }, + { + "epoch": 4.385404789053592, + "grad_norm": 2.9375, + "learning_rate": 3.2077806873580696e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8848164230585098, + "num_tokens": 205865218.0, + "step": 1925 + }, + { + "epoch": 4.387685290763968, + "grad_norm": 5.25, + "learning_rate": 3.205974095802582e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8819639384746552, + "num_tokens": 205971573.0, + "step": 1926 + }, + { + "epoch": 4.389965792474344, + "grad_norm": 6.0, + "learning_rate": 3.204167103549827e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8817443251609802, + "num_tokens": 206078630.0, + "step": 1927 + }, + { + "epoch": 4.39224629418472, + "grad_norm": 4.15625, + "learning_rate": 3.2023597116254175e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8799740672111511, + "num_tokens": 206185346.0, + "step": 1928 + }, + { + "epoch": 4.394526795895097, + "grad_norm": 2.625, + "learning_rate": 3.2005519210551955e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.885019987821579, + "num_tokens": 206292483.0, + "step": 1929 + }, + { + "epoch": 4.3968072976054735, + "grad_norm": 2.5625, + "learning_rate": 3.1987437328652287e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8824569880962372, + "num_tokens": 206399488.0, + "step": 1930 + }, + { + "epoch": 4.39908779931585, + "grad_norm": 2.671875, + "learning_rate": 3.196935148081808e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8730659335851669, + "num_tokens": 206506633.0, + "step": 1931 + }, + { + "epoch": 4.401368301026226, + "grad_norm": 3.140625, + "learning_rate": 3.1951261677314526e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8806923627853394, + "num_tokens": 206614034.0, + "step": 1932 + }, + { + "epoch": 4.403648802736602, + "grad_norm": 2.546875, + "learning_rate": 3.1933167928409046e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.882137656211853, + "num_tokens": 206721766.0, + "step": 1933 + }, + { + "epoch": 4.405929304446978, + "grad_norm": 2.46875, + "learning_rate": 3.1915070244371295e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8834485858678818, + "num_tokens": 206829226.0, + "step": 1934 + }, + { + "epoch": 4.4082098061573545, + "grad_norm": 3.078125, + "learning_rate": 3.1896968635473174e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8820080310106277, + "num_tokens": 206936587.0, + "step": 1935 + }, + { + "epoch": 4.410490307867731, + "grad_norm": 2.4375, + "learning_rate": 3.187886311198881e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8846455514431, + "num_tokens": 207043902.0, + "step": 1936 + }, + { + "epoch": 4.412770809578107, + "grad_norm": 4.75, + "learning_rate": 3.1860753684194536e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8813911080360413, + "num_tokens": 207151288.0, + "step": 1937 + }, + { + "epoch": 4.415051311288483, + "grad_norm": 3.78125, + "learning_rate": 3.1842640362368932e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.878994882106781, + "num_tokens": 207258999.0, + "step": 1938 + }, + { + "epoch": 4.41733181299886, + "grad_norm": 3.046875, + "learning_rate": 3.182452315679276e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8886394798755646, + "num_tokens": 207366568.0, + "step": 1939 + }, + { + "epoch": 4.419612314709236, + "grad_norm": 2.4375, + "learning_rate": 3.1806402077748987e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8816164880990982, + "num_tokens": 207473843.0, + "step": 1940 + }, + { + "epoch": 4.4218928164196125, + "grad_norm": 2.84375, + "learning_rate": 3.178827713552281e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8799097836017609, + "num_tokens": 207580995.0, + "step": 1941 + }, + { + "epoch": 4.424173318129989, + "grad_norm": 2.984375, + "learning_rate": 3.177014834040158e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.883696660399437, + "num_tokens": 207688161.0, + "step": 1942 + }, + { + "epoch": 4.426453819840365, + "grad_norm": 2.5625, + "learning_rate": 3.1752015702674855e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8793037235736847, + "num_tokens": 207795164.0, + "step": 1943 + }, + { + "epoch": 4.428734321550741, + "grad_norm": 2.5, + "learning_rate": 3.173387923263437e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8862407803535461, + "num_tokens": 207902155.0, + "step": 1944 + }, + { + "epoch": 4.431014823261117, + "grad_norm": 3.453125, + "learning_rate": 3.1715738940574032e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8828154355287552, + "num_tokens": 208009297.0, + "step": 1945 + }, + { + "epoch": 4.4332953249714935, + "grad_norm": 4.0625, + "learning_rate": 3.1697594836789924e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8778323084115982, + "num_tokens": 208116608.0, + "step": 1946 + }, + { + "epoch": 4.43557582668187, + "grad_norm": 2.59375, + "learning_rate": 3.167944693158029e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8808430731296539, + "num_tokens": 208223390.0, + "step": 1947 + }, + { + "epoch": 4.437856328392247, + "grad_norm": 3.578125, + "learning_rate": 3.166129523524553e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8762641549110413, + "num_tokens": 208330138.0, + "step": 1948 + }, + { + "epoch": 4.440136830102623, + "grad_norm": 2.890625, + "learning_rate": 3.1643139758088194e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8787393867969513, + "num_tokens": 208437552.0, + "step": 1949 + }, + { + "epoch": 4.442417331812999, + "grad_norm": 4.28125, + "learning_rate": 3.1624980510412984e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.8784570544958115, + "num_tokens": 208544652.0, + "step": 1950 + }, + { + "epoch": 4.444697833523375, + "grad_norm": 3.484375, + "learning_rate": 3.160681750252674e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8795022666454315, + "num_tokens": 208651498.0, + "step": 1951 + }, + { + "epoch": 4.4469783352337515, + "grad_norm": 3.78125, + "learning_rate": 3.1588650744738418e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.8832292854785919, + "num_tokens": 208758479.0, + "step": 1952 + }, + { + "epoch": 4.449258836944128, + "grad_norm": 2.859375, + "learning_rate": 3.1570480247359147e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8820718824863434, + "num_tokens": 208865610.0, + "step": 1953 + }, + { + "epoch": 4.451539338654504, + "grad_norm": 2.984375, + "learning_rate": 3.155230602070213e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.879115030169487, + "num_tokens": 208972259.0, + "step": 1954 + }, + { + "epoch": 4.45381984036488, + "grad_norm": 3.09375, + "learning_rate": 3.153412807508271e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8792910128831863, + "num_tokens": 209078252.0, + "step": 1955 + }, + { + "epoch": 4.456100342075256, + "grad_norm": 2.96875, + "learning_rate": 3.1515946420818343e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8846887648105621, + "num_tokens": 209185333.0, + "step": 1956 + }, + { + "epoch": 4.4583808437856325, + "grad_norm": 2.65625, + "learning_rate": 3.1497761068228585e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8841688334941864, + "num_tokens": 209292465.0, + "step": 1957 + }, + { + "epoch": 4.460661345496009, + "grad_norm": 3.5, + "learning_rate": 3.1479572027635085e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8820602297782898, + "num_tokens": 209399257.0, + "step": 1958 + }, + { + "epoch": 4.462941847206386, + "grad_norm": 4.15625, + "learning_rate": 3.1461379309361594e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8786464035511017, + "num_tokens": 209506400.0, + "step": 1959 + }, + { + "epoch": 4.465222348916762, + "grad_norm": 2.671875, + "learning_rate": 3.144318292373395e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8865722268819809, + "num_tokens": 209613571.0, + "step": 1960 + }, + { + "epoch": 4.467502850627138, + "grad_norm": 3.4375, + "learning_rate": 3.142498288108007e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8826761692762375, + "num_tokens": 209720956.0, + "step": 1961 + }, + { + "epoch": 4.469783352337514, + "grad_norm": 2.84375, + "learning_rate": 3.1406779191729954e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8883082419633865, + "num_tokens": 209827663.0, + "step": 1962 + }, + { + "epoch": 4.4720638540478905, + "grad_norm": 3.71875, + "learning_rate": 3.1388571866015645e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8810272365808487, + "num_tokens": 209934611.0, + "step": 1963 + }, + { + "epoch": 4.474344355758267, + "grad_norm": 3.703125, + "learning_rate": 3.1370360914271286e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8840647041797638, + "num_tokens": 210041595.0, + "step": 1964 + }, + { + "epoch": 4.476624857468643, + "grad_norm": 3.5625, + "learning_rate": 3.1352146346833057e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.877694696187973, + "num_tokens": 210148918.0, + "step": 1965 + }, + { + "epoch": 4.478905359179019, + "grad_norm": 2.71875, + "learning_rate": 3.133392817403919e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8824283182621002, + "num_tokens": 210256121.0, + "step": 1966 + }, + { + "epoch": 4.481185860889395, + "grad_norm": 3.1875, + "learning_rate": 3.131570640622998e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8796381056308746, + "num_tokens": 210363033.0, + "step": 1967 + }, + { + "epoch": 4.483466362599772, + "grad_norm": 2.734375, + "learning_rate": 3.1297481053747737e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8865970373153687, + "num_tokens": 210470814.0, + "step": 1968 + }, + { + "epoch": 4.485746864310149, + "grad_norm": 2.515625, + "learning_rate": 3.127925212693682e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8791713118553162, + "num_tokens": 210577490.0, + "step": 1969 + }, + { + "epoch": 4.488027366020525, + "grad_norm": 3.21875, + "learning_rate": 3.1261019636143636e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8812829405069351, + "num_tokens": 210684376.0, + "step": 1970 + }, + { + "epoch": 4.490307867730901, + "grad_norm": 2.484375, + "learning_rate": 3.124278359171657e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.8833597153425217, + "num_tokens": 210791620.0, + "step": 1971 + }, + { + "epoch": 4.492588369441277, + "grad_norm": 3.671875, + "learning_rate": 3.122454400400606e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8824751228094101, + "num_tokens": 210898333.0, + "step": 1972 + }, + { + "epoch": 4.494868871151653, + "grad_norm": 2.34375, + "learning_rate": 3.1206300883364547e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8802805244922638, + "num_tokens": 211005919.0, + "step": 1973 + }, + { + "epoch": 4.4971493728620295, + "grad_norm": 2.59375, + "learning_rate": 3.1188054240146463e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.880774512887001, + "num_tokens": 211113271.0, + "step": 1974 + }, + { + "epoch": 4.499429874572406, + "grad_norm": 2.59375, + "learning_rate": 3.1169804084708267e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.880780965089798, + "num_tokens": 211220394.0, + "step": 1975 + }, + { + "epoch": 4.501710376282782, + "grad_norm": 3.796875, + "learning_rate": 3.1151550427408383e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8800267279148102, + "num_tokens": 211327515.0, + "step": 1976 + }, + { + "epoch": 4.503990877993158, + "grad_norm": 4.46875, + "learning_rate": 3.1133293278607228e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.882025346159935, + "num_tokens": 211434652.0, + "step": 1977 + }, + { + "epoch": 4.506271379703534, + "grad_norm": 2.671875, + "learning_rate": 3.1115032648667224e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8816207945346832, + "num_tokens": 211542180.0, + "step": 1978 + }, + { + "epoch": 4.508551881413911, + "grad_norm": 3.25, + "learning_rate": 3.1096768547952743e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8829948008060455, + "num_tokens": 211649219.0, + "step": 1979 + }, + { + "epoch": 4.510832383124288, + "grad_norm": 2.5, + "learning_rate": 3.1078500986830134e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8840111643075943, + "num_tokens": 211756443.0, + "step": 1980 + }, + { + "epoch": 4.510832383124288, + "eval_loss": 0.5894109606742859, + "eval_mean_token_accuracy": 0.8794237710677172, + "eval_num_tokens": 211756443.0, + "eval_runtime": 58.6388, + "eval_samples_per_second": 142.994, + "eval_steps_per_second": 4.485, + "step": 1980 + }, + { + "epoch": 4.513112884834664, + "grad_norm": 3.09375, + "learning_rate": 3.1060229975667716e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8810431063175201, + "num_tokens": 211863529.0, + "step": 1981 + }, + { + "epoch": 4.51539338654504, + "grad_norm": 5.625, + "learning_rate": 3.104195552483576e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8787284940481186, + "num_tokens": 211970403.0, + "step": 1982 + }, + { + "epoch": 4.517673888255416, + "grad_norm": 5.21875, + "learning_rate": 3.102367764470649e-06, + "loss": 0.568, + "mean_token_accuracy": 0.8812828063964844, + "num_tokens": 212077504.0, + "step": 1983 + }, + { + "epoch": 4.519954389965792, + "grad_norm": 3.828125, + "learning_rate": 3.1005396345654087e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8794949948787689, + "num_tokens": 212185300.0, + "step": 1984 + }, + { + "epoch": 4.5222348916761685, + "grad_norm": 4.25, + "learning_rate": 3.0987111638054657e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8824008107185364, + "num_tokens": 212292097.0, + "step": 1985 + }, + { + "epoch": 4.524515393386545, + "grad_norm": 5.65625, + "learning_rate": 3.0968823532286246e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8847775161266327, + "num_tokens": 212399289.0, + "step": 1986 + }, + { + "epoch": 4.526795895096921, + "grad_norm": 4.15625, + "learning_rate": 3.095053203872883e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8831916600465775, + "num_tokens": 212506647.0, + "step": 1987 + }, + { + "epoch": 4.529076396807298, + "grad_norm": 2.453125, + "learning_rate": 3.0932237167764306e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.883448526263237, + "num_tokens": 212613620.0, + "step": 1988 + }, + { + "epoch": 4.531356898517674, + "grad_norm": 4.46875, + "learning_rate": 3.0913938929776493e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8791800290346146, + "num_tokens": 212720388.0, + "step": 1989 + }, + { + "epoch": 4.53363740022805, + "grad_norm": 2.703125, + "learning_rate": 3.0895637335151117e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8825742602348328, + "num_tokens": 212827508.0, + "step": 1990 + }, + { + "epoch": 4.535917901938427, + "grad_norm": 4.59375, + "learning_rate": 3.0877332394275806e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.878256767988205, + "num_tokens": 212933843.0, + "step": 1991 + }, + { + "epoch": 4.538198403648803, + "grad_norm": 2.734375, + "learning_rate": 3.08590241175401e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8803077787160873, + "num_tokens": 213040798.0, + "step": 1992 + }, + { + "epoch": 4.540478905359179, + "grad_norm": 5.1875, + "learning_rate": 3.0840712515335412e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8778508752584457, + "num_tokens": 213147454.0, + "step": 1993 + }, + { + "epoch": 4.542759407069555, + "grad_norm": 2.921875, + "learning_rate": 3.0822397598055065e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8773138970136642, + "num_tokens": 213255022.0, + "step": 1994 + }, + { + "epoch": 4.545039908779931, + "grad_norm": 4.375, + "learning_rate": 3.080407937609424e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8787964135408401, + "num_tokens": 213362436.0, + "step": 1995 + }, + { + "epoch": 4.5473204104903076, + "grad_norm": 2.609375, + "learning_rate": 3.0785757859850025e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8769951462745667, + "num_tokens": 213469549.0, + "step": 1996 + }, + { + "epoch": 4.549600912200685, + "grad_norm": 3.03125, + "learning_rate": 3.0767433059721338e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8808469474315643, + "num_tokens": 213576382.0, + "step": 1997 + }, + { + "epoch": 4.55188141391106, + "grad_norm": 2.921875, + "learning_rate": 3.074910498610899e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.887230396270752, + "num_tokens": 213683677.0, + "step": 1998 + }, + { + "epoch": 4.554161915621437, + "grad_norm": 5.03125, + "learning_rate": 3.0730773649415647e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8774087131023407, + "num_tokens": 213790372.0, + "step": 1999 + }, + { + "epoch": 4.556442417331813, + "grad_norm": 6.09375, + "learning_rate": 3.0712439060045818e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.884253740310669, + "num_tokens": 213897106.0, + "step": 2000 + }, + { + "epoch": 4.558722919042189, + "grad_norm": 5.25, + "learning_rate": 3.069410122840585e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8797920346260071, + "num_tokens": 214004255.0, + "step": 2001 + }, + { + "epoch": 4.561003420752566, + "grad_norm": 2.546875, + "learning_rate": 3.0675760164903972e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8811387270689011, + "num_tokens": 214111233.0, + "step": 2002 + }, + { + "epoch": 4.563283922462942, + "grad_norm": 3.125, + "learning_rate": 3.065741587995019e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8835022449493408, + "num_tokens": 214218120.0, + "step": 2003 + }, + { + "epoch": 4.565564424173318, + "grad_norm": 2.984375, + "learning_rate": 3.0639068383956373e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.880307137966156, + "num_tokens": 214324888.0, + "step": 2004 + }, + { + "epoch": 4.567844925883694, + "grad_norm": 2.640625, + "learning_rate": 3.062071768733621e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.886817216873169, + "num_tokens": 214432054.0, + "step": 2005 + }, + { + "epoch": 4.57012542759407, + "grad_norm": 3.8125, + "learning_rate": 3.0602363800505198e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.882687121629715, + "num_tokens": 214539517.0, + "step": 2006 + }, + { + "epoch": 4.572405929304447, + "grad_norm": 3.015625, + "learning_rate": 3.0584006733880656e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8774117529392242, + "num_tokens": 214646420.0, + "step": 2007 + }, + { + "epoch": 4.574686431014824, + "grad_norm": 2.890625, + "learning_rate": 3.0565646497881697e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8867736458778381, + "num_tokens": 214753267.0, + "step": 2008 + }, + { + "epoch": 4.5769669327252, + "grad_norm": 3.078125, + "learning_rate": 3.0547283102929228e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8841065913438797, + "num_tokens": 214860275.0, + "step": 2009 + }, + { + "epoch": 4.579247434435576, + "grad_norm": 3.109375, + "learning_rate": 3.0528916559445967e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8801997601985931, + "num_tokens": 214967499.0, + "step": 2010 + }, + { + "epoch": 4.581527936145952, + "grad_norm": 2.390625, + "learning_rate": 3.05105468778564e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8840168863534927, + "num_tokens": 215075121.0, + "step": 2011 + }, + { + "epoch": 4.583808437856328, + "grad_norm": 5.3125, + "learning_rate": 3.049217406858681e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.8759740740060806, + "num_tokens": 215181867.0, + "step": 2012 + }, + { + "epoch": 4.586088939566705, + "grad_norm": 4.09375, + "learning_rate": 3.047379814206526e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8838594704866409, + "num_tokens": 215288922.0, + "step": 2013 + }, + { + "epoch": 4.588369441277081, + "grad_norm": 2.546875, + "learning_rate": 3.0455419108721556e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8816310912370682, + "num_tokens": 215395792.0, + "step": 2014 + }, + { + "epoch": 4.590649942987457, + "grad_norm": 2.953125, + "learning_rate": 3.043703697898728e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8817844688892365, + "num_tokens": 215503184.0, + "step": 2015 + }, + { + "epoch": 4.592930444697833, + "grad_norm": 2.234375, + "learning_rate": 3.041865176329579e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8872413337230682, + "num_tokens": 215610549.0, + "step": 2016 + }, + { + "epoch": 4.59521094640821, + "grad_norm": 3.609375, + "learning_rate": 3.040026347208217e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.882647380232811, + "num_tokens": 215718112.0, + "step": 2017 + }, + { + "epoch": 4.5974914481185865, + "grad_norm": 3.015625, + "learning_rate": 3.0381872115783256e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8802604079246521, + "num_tokens": 215825609.0, + "step": 2018 + }, + { + "epoch": 4.599771949828963, + "grad_norm": 3.140625, + "learning_rate": 3.0363477704837633e-06, + "loss": 0.552, + "mean_token_accuracy": 0.8875061571598053, + "num_tokens": 215932611.0, + "step": 2019 + }, + { + "epoch": 4.602052451539339, + "grad_norm": 3.1875, + "learning_rate": 3.034508024968561e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8848912864923477, + "num_tokens": 216040088.0, + "step": 2020 + }, + { + "epoch": 4.604332953249715, + "grad_norm": 4.1875, + "learning_rate": 3.032667976076923e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8811133801937103, + "num_tokens": 216147228.0, + "step": 2021 + }, + { + "epoch": 4.606613454960091, + "grad_norm": 2.6875, + "learning_rate": 3.0308276248532244e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8812815397977829, + "num_tokens": 216254056.0, + "step": 2022 + }, + { + "epoch": 4.608893956670467, + "grad_norm": 2.984375, + "learning_rate": 3.0289869723420144e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8829955011606216, + "num_tokens": 216361270.0, + "step": 2023 + }, + { + "epoch": 4.611174458380844, + "grad_norm": 2.46875, + "learning_rate": 3.027146019588012e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8840386420488358, + "num_tokens": 216468575.0, + "step": 2024 + }, + { + "epoch": 4.61345496009122, + "grad_norm": 5.53125, + "learning_rate": 3.025304767636105e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8865272402763367, + "num_tokens": 216576234.0, + "step": 2025 + }, + { + "epoch": 4.615735461801596, + "grad_norm": 5.40625, + "learning_rate": 3.0234632175313537e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8783838450908661, + "num_tokens": 216683054.0, + "step": 2026 + }, + { + "epoch": 4.618015963511972, + "grad_norm": 2.703125, + "learning_rate": 3.0216213703189856e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8805306851863861, + "num_tokens": 216790089.0, + "step": 2027 + }, + { + "epoch": 4.620296465222349, + "grad_norm": 2.46875, + "learning_rate": 3.019779227044398e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8831235766410828, + "num_tokens": 216897778.0, + "step": 2028 + }, + { + "epoch": 4.6225769669327255, + "grad_norm": 2.78125, + "learning_rate": 3.0179367887531567e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8810001909732819, + "num_tokens": 217004442.0, + "step": 2029 + }, + { + "epoch": 4.624857468643102, + "grad_norm": 2.515625, + "learning_rate": 3.016094056490993e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8854062855243683, + "num_tokens": 217111586.0, + "step": 2030 + }, + { + "epoch": 4.627137970353478, + "grad_norm": 2.40625, + "learning_rate": 3.0142510313038057e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8880758136510849, + "num_tokens": 217219168.0, + "step": 2031 + }, + { + "epoch": 4.629418472063854, + "grad_norm": 2.5625, + "learning_rate": 3.012407714237662e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8846538215875626, + "num_tokens": 217326211.0, + "step": 2032 + }, + { + "epoch": 4.63169897377423, + "grad_norm": 3.4375, + "learning_rate": 3.010564106338791e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8847674578428268, + "num_tokens": 217433707.0, + "step": 2033 + }, + { + "epoch": 4.633979475484606, + "grad_norm": 2.375, + "learning_rate": 3.0087202086535915e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8850553631782532, + "num_tokens": 217541577.0, + "step": 2034 + }, + { + "epoch": 4.636259977194983, + "grad_norm": 3.1875, + "learning_rate": 3.006876022228622e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.882301077246666, + "num_tokens": 217648764.0, + "step": 2035 + }, + { + "epoch": 4.638540478905359, + "grad_norm": 2.59375, + "learning_rate": 3.0050315481106074e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8841645568609238, + "num_tokens": 217755467.0, + "step": 2036 + }, + { + "epoch": 4.640820980615736, + "grad_norm": 3.390625, + "learning_rate": 3.0031867873464372e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8814872205257416, + "num_tokens": 217862349.0, + "step": 2037 + }, + { + "epoch": 4.643101482326112, + "grad_norm": 2.875, + "learning_rate": 3.00134174098316e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8733366578817368, + "num_tokens": 217968209.0, + "step": 2038 + }, + { + "epoch": 4.645381984036488, + "grad_norm": 2.578125, + "learning_rate": 2.999496410067989e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8875339925289154, + "num_tokens": 218076237.0, + "step": 2039 + }, + { + "epoch": 4.6476624857468645, + "grad_norm": 3.640625, + "learning_rate": 2.9976507956482996e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8795648515224457, + "num_tokens": 218183464.0, + "step": 2040 + }, + { + "epoch": 4.649942987457241, + "grad_norm": 2.46875, + "learning_rate": 2.9958048987716266e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.8746256977319717, + "num_tokens": 218290055.0, + "step": 2041 + }, + { + "epoch": 4.652223489167617, + "grad_norm": 3.703125, + "learning_rate": 2.993958720485664e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8757225871086121, + "num_tokens": 218397051.0, + "step": 2042 + }, + { + "epoch": 4.654503990877993, + "grad_norm": 2.828125, + "learning_rate": 2.9921122618382687e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8802944868803024, + "num_tokens": 218504413.0, + "step": 2043 + }, + { + "epoch": 4.656784492588369, + "grad_norm": 2.71875, + "learning_rate": 2.9902655238774537e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8826518207788467, + "num_tokens": 218611326.0, + "step": 2044 + }, + { + "epoch": 4.659064994298745, + "grad_norm": 3.78125, + "learning_rate": 2.988418507651392e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8791725039482117, + "num_tokens": 218718666.0, + "step": 2045 + }, + { + "epoch": 4.661345496009122, + "grad_norm": 3.28125, + "learning_rate": 2.9865712142084145e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8884040713310242, + "num_tokens": 218826339.0, + "step": 2046 + }, + { + "epoch": 4.663625997719498, + "grad_norm": 3.46875, + "learning_rate": 2.98472364459701e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8853495866060257, + "num_tokens": 218933583.0, + "step": 2047 + }, + { + "epoch": 4.665906499429875, + "grad_norm": 3.703125, + "learning_rate": 2.982875799865823e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8821221739053726, + "num_tokens": 219040654.0, + "step": 2048 + }, + { + "epoch": 4.668187001140251, + "grad_norm": 3.328125, + "learning_rate": 2.9810276810636535e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8774379193782806, + "num_tokens": 219147467.0, + "step": 2049 + }, + { + "epoch": 4.670467502850627, + "grad_norm": 3.546875, + "learning_rate": 2.97917928923946e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.879958301782608, + "num_tokens": 219254877.0, + "step": 2050 + }, + { + "epoch": 4.6727480045610035, + "grad_norm": 2.578125, + "learning_rate": 2.977330625442352e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8850719779729843, + "num_tokens": 219362293.0, + "step": 2051 + }, + { + "epoch": 4.67502850627138, + "grad_norm": 4.09375, + "learning_rate": 2.9754816907215963e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8815977722406387, + "num_tokens": 219469187.0, + "step": 2052 + }, + { + "epoch": 4.677309007981756, + "grad_norm": 2.5625, + "learning_rate": 2.9736324861266125e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8775934129953384, + "num_tokens": 219576287.0, + "step": 2053 + }, + { + "epoch": 4.679589509692132, + "grad_norm": 5.3125, + "learning_rate": 2.9717830127069734e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8794510513544083, + "num_tokens": 219683068.0, + "step": 2054 + }, + { + "epoch": 4.681870011402508, + "grad_norm": 2.796875, + "learning_rate": 2.969933271512404e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8821385949850082, + "num_tokens": 219790016.0, + "step": 2055 + }, + { + "epoch": 4.684150513112884, + "grad_norm": 2.890625, + "learning_rate": 2.9680832635927824e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8790445029735565, + "num_tokens": 219896728.0, + "step": 2056 + }, + { + "epoch": 4.6864310148232615, + "grad_norm": 2.75, + "learning_rate": 2.9662329899981375e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8832840174436569, + "num_tokens": 220004033.0, + "step": 2057 + }, + { + "epoch": 4.688711516533638, + "grad_norm": 3.3125, + "learning_rate": 2.964382451778648e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8850243538618088, + "num_tokens": 220111159.0, + "step": 2058 + }, + { + "epoch": 4.690992018244014, + "grad_norm": 3.984375, + "learning_rate": 2.9625316499846444e-06, + "loss": 0.582, + "mean_token_accuracy": 0.8815952986478806, + "num_tokens": 220218001.0, + "step": 2059 + }, + { + "epoch": 4.69327251995439, + "grad_norm": 2.8125, + "learning_rate": 2.9606805856666053e-06, + "loss": 0.6012, + "mean_token_accuracy": 0.8748088330030441, + "num_tokens": 220325346.0, + "step": 2060 + }, + { + "epoch": 4.695553021664766, + "grad_norm": 2.34375, + "learning_rate": 2.95882925987516e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8794578313827515, + "num_tokens": 220432554.0, + "step": 2061 + }, + { + "epoch": 4.6978335233751425, + "grad_norm": 3.390625, + "learning_rate": 2.9569776736610855e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8808020949363708, + "num_tokens": 220539630.0, + "step": 2062 + }, + { + "epoch": 4.700114025085519, + "grad_norm": 2.96875, + "learning_rate": 2.9551258280753046e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8800801783800125, + "num_tokens": 220646814.0, + "step": 2063 + }, + { + "epoch": 4.702394526795895, + "grad_norm": 3.8125, + "learning_rate": 2.953273724168891e-06, + "loss": 0.568, + "mean_token_accuracy": 0.8822778612375259, + "num_tokens": 220753926.0, + "step": 2064 + }, + { + "epoch": 4.704675028506271, + "grad_norm": 3.53125, + "learning_rate": 2.9514213629930614e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8794926106929779, + "num_tokens": 220860498.0, + "step": 2065 + }, + { + "epoch": 4.706955530216648, + "grad_norm": 2.890625, + "learning_rate": 2.949568745599182e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8850435167551041, + "num_tokens": 220967240.0, + "step": 2066 + }, + { + "epoch": 4.7092360319270234, + "grad_norm": 4.21875, + "learning_rate": 2.9477158730387615e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8826223760843277, + "num_tokens": 221074023.0, + "step": 2067 + }, + { + "epoch": 4.7115165336374005, + "grad_norm": 2.90625, + "learning_rate": 2.945862746363455e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8868841081857681, + "num_tokens": 221181425.0, + "step": 2068 + }, + { + "epoch": 4.713797035347777, + "grad_norm": 3.4375, + "learning_rate": 2.944009366625061e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8801351636648178, + "num_tokens": 221288251.0, + "step": 2069 + }, + { + "epoch": 4.716077537058153, + "grad_norm": 2.765625, + "learning_rate": 2.942155734875523e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8805908411741257, + "num_tokens": 221395168.0, + "step": 2070 + }, + { + "epoch": 4.718358038768529, + "grad_norm": 6.375, + "learning_rate": 2.9403018521669256e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8779006004333496, + "num_tokens": 221502170.0, + "step": 2071 + }, + { + "epoch": 4.720638540478905, + "grad_norm": 3.71875, + "learning_rate": 2.938447719551498e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8802389800548553, + "num_tokens": 221608850.0, + "step": 2072 + }, + { + "epoch": 4.7229190421892815, + "grad_norm": 3.3125, + "learning_rate": 2.9365933380816092e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8805797696113586, + "num_tokens": 221715991.0, + "step": 2073 + }, + { + "epoch": 4.725199543899658, + "grad_norm": 5.28125, + "learning_rate": 2.93473870880977e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8762219101190567, + "num_tokens": 221823317.0, + "step": 2074 + }, + { + "epoch": 4.727480045610034, + "grad_norm": 3.578125, + "learning_rate": 2.932883832788633e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8835586160421371, + "num_tokens": 221930410.0, + "step": 2075 + }, + { + "epoch": 4.72976054732041, + "grad_norm": 2.609375, + "learning_rate": 2.9310287110709895e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.881802573800087, + "num_tokens": 222037632.0, + "step": 2076 + }, + { + "epoch": 4.732041049030787, + "grad_norm": 3.828125, + "learning_rate": 2.9291733447097714e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8866459727287292, + "num_tokens": 222144620.0, + "step": 2077 + }, + { + "epoch": 4.734321550741163, + "grad_norm": 3.625, + "learning_rate": 2.927317734758047e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.879328653216362, + "num_tokens": 222251771.0, + "step": 2078 + }, + { + "epoch": 4.7366020524515395, + "grad_norm": 4.21875, + "learning_rate": 2.925461882269027e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8794370591640472, + "num_tokens": 222359272.0, + "step": 2079 + }, + { + "epoch": 4.738882554161916, + "grad_norm": 2.671875, + "learning_rate": 2.9236057882960567e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8789798021316528, + "num_tokens": 222466677.0, + "step": 2080 + }, + { + "epoch": 4.741163055872292, + "grad_norm": 3.09375, + "learning_rate": 2.921749453892618e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8830148726701736, + "num_tokens": 222573940.0, + "step": 2081 + }, + { + "epoch": 4.743443557582668, + "grad_norm": 2.765625, + "learning_rate": 2.919892880112332e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8783297091722488, + "num_tokens": 222680851.0, + "step": 2082 + }, + { + "epoch": 4.745724059293044, + "grad_norm": 3.109375, + "learning_rate": 2.9180360680089542e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8823717683553696, + "num_tokens": 222787611.0, + "step": 2083 + }, + { + "epoch": 4.7480045610034205, + "grad_norm": 2.75, + "learning_rate": 2.9161790186363746e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8819119483232498, + "num_tokens": 222894602.0, + "step": 2084 + }, + { + "epoch": 4.750285062713797, + "grad_norm": 2.8125, + "learning_rate": 2.9143217330486186e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8828014135360718, + "num_tokens": 223001331.0, + "step": 2085 + }, + { + "epoch": 4.752565564424174, + "grad_norm": 3.078125, + "learning_rate": 2.9124642122998453e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8834296762943268, + "num_tokens": 223108795.0, + "step": 2086 + }, + { + "epoch": 4.75484606613455, + "grad_norm": 4.53125, + "learning_rate": 2.9106064574443477e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8811309486627579, + "num_tokens": 223215668.0, + "step": 2087 + }, + { + "epoch": 4.757126567844926, + "grad_norm": 6.09375, + "learning_rate": 2.9087484695365523e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8807103782892227, + "num_tokens": 223322617.0, + "step": 2088 + }, + { + "epoch": 4.759407069555302, + "grad_norm": 5.6875, + "learning_rate": 2.906890249631017e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8810094445943832, + "num_tokens": 223429512.0, + "step": 2089 + }, + { + "epoch": 4.7616875712656785, + "grad_norm": 2.953125, + "learning_rate": 2.905031798782431e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8895687907934189, + "num_tokens": 223536691.0, + "step": 2090 + }, + { + "epoch": 4.763968072976055, + "grad_norm": 2.75, + "learning_rate": 2.903173118045616e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8834663927555084, + "num_tokens": 223643826.0, + "step": 2091 + }, + { + "epoch": 4.766248574686431, + "grad_norm": 4.40625, + "learning_rate": 2.901314208475522e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8782705664634705, + "num_tokens": 223750903.0, + "step": 2092 + }, + { + "epoch": 4.768529076396807, + "grad_norm": 4.28125, + "learning_rate": 2.8994550711272317e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8804999589920044, + "num_tokens": 223858002.0, + "step": 2093 + }, + { + "epoch": 4.770809578107183, + "grad_norm": 2.921875, + "learning_rate": 2.897595707055954e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8842459321022034, + "num_tokens": 223965056.0, + "step": 2094 + }, + { + "epoch": 4.7730900798175595, + "grad_norm": 3.21875, + "learning_rate": 2.8957361173170297e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8789258599281311, + "num_tokens": 224071784.0, + "step": 2095 + }, + { + "epoch": 4.775370581527936, + "grad_norm": 3.015625, + "learning_rate": 2.893876302965925e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8828511238098145, + "num_tokens": 224178796.0, + "step": 2096 + }, + { + "epoch": 4.777651083238313, + "grad_norm": 2.96875, + "learning_rate": 2.8920162650582344e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8815236538648605, + "num_tokens": 224287157.0, + "step": 2097 + }, + { + "epoch": 4.779931584948689, + "grad_norm": 4.625, + "learning_rate": 2.8901560046496797e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8796006739139557, + "num_tokens": 224394174.0, + "step": 2098 + }, + { + "epoch": 4.782212086659065, + "grad_norm": 4.34375, + "learning_rate": 2.8882955227961098e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8783185184001923, + "num_tokens": 224501173.0, + "step": 2099 + }, + { + "epoch": 4.784492588369441, + "grad_norm": 3.6875, + "learning_rate": 2.886434820553497e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8836559951305389, + "num_tokens": 224608530.0, + "step": 2100 + }, + { + "epoch": 4.7867730900798175, + "grad_norm": 3.25, + "learning_rate": 2.884573898977941e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8841763436794281, + "num_tokens": 224715326.0, + "step": 2101 + }, + { + "epoch": 4.789053591790194, + "grad_norm": 2.5, + "learning_rate": 2.882712759125664e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8851722776889801, + "num_tokens": 224822759.0, + "step": 2102 + }, + { + "epoch": 4.79133409350057, + "grad_norm": 2.65625, + "learning_rate": 2.8808514020530127e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8818291127681732, + "num_tokens": 224929583.0, + "step": 2103 + }, + { + "epoch": 4.793614595210946, + "grad_norm": 3.859375, + "learning_rate": 2.8789898288164595e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8850146681070328, + "num_tokens": 225036349.0, + "step": 2104 + }, + { + "epoch": 4.795895096921322, + "grad_norm": 3.890625, + "learning_rate": 2.8771280404725953e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8813169300556183, + "num_tokens": 225142845.0, + "step": 2105 + }, + { + "epoch": 4.798175598631699, + "grad_norm": 5.875, + "learning_rate": 2.8752660380781367e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8808184266090393, + "num_tokens": 225250066.0, + "step": 2106 + }, + { + "epoch": 4.800456100342076, + "grad_norm": 4.0, + "learning_rate": 2.8734038226899198e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.8856227844953537, + "num_tokens": 225356570.0, + "step": 2107 + }, + { + "epoch": 4.802736602052452, + "grad_norm": 2.8125, + "learning_rate": 2.8715413953649012e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8814336508512497, + "num_tokens": 225463833.0, + "step": 2108 + }, + { + "epoch": 4.805017103762828, + "grad_norm": 4.125, + "learning_rate": 2.8696787571601597e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8789616823196411, + "num_tokens": 225570947.0, + "step": 2109 + }, + { + "epoch": 4.807297605473204, + "grad_norm": 5.4375, + "learning_rate": 2.8678159091328926e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8813024163246155, + "num_tokens": 225678130.0, + "step": 2110 + }, + { + "epoch": 4.80957810718358, + "grad_norm": 2.8125, + "learning_rate": 2.865952852340417e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8798904716968536, + "num_tokens": 225785246.0, + "step": 2111 + }, + { + "epoch": 4.811858608893957, + "grad_norm": 6.53125, + "learning_rate": 2.864089587840167e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8791421502828598, + "num_tokens": 225891598.0, + "step": 2112 + }, + { + "epoch": 4.814139110604333, + "grad_norm": 3.25, + "learning_rate": 2.862226116689696e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8808872699737549, + "num_tokens": 225998427.0, + "step": 2113 + }, + { + "epoch": 4.816419612314709, + "grad_norm": 2.515625, + "learning_rate": 2.8603624399466732e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8853624612092972, + "num_tokens": 226106239.0, + "step": 2114 + }, + { + "epoch": 4.818700114025085, + "grad_norm": 2.46875, + "learning_rate": 2.858498558668888e-06, + "loss": 0.549, + "mean_token_accuracy": 0.886398509144783, + "num_tokens": 226213512.0, + "step": 2115 + }, + { + "epoch": 4.820980615735461, + "grad_norm": 2.984375, + "learning_rate": 2.856634473914242e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8814459592103958, + "num_tokens": 226321236.0, + "step": 2116 + }, + { + "epoch": 4.823261117445838, + "grad_norm": 4.0625, + "learning_rate": 2.854770186740753e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8803770244121552, + "num_tokens": 226428875.0, + "step": 2117 + }, + { + "epoch": 4.825541619156215, + "grad_norm": 6.09375, + "learning_rate": 2.8529056982065557e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8832362145185471, + "num_tokens": 226535810.0, + "step": 2118 + }, + { + "epoch": 4.827822120866591, + "grad_norm": 5.875, + "learning_rate": 2.8510410093698966e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8776679933071136, + "num_tokens": 226642320.0, + "step": 2119 + }, + { + "epoch": 4.830102622576967, + "grad_norm": 6.03125, + "learning_rate": 2.849176121289138e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8798095434904099, + "num_tokens": 226750138.0, + "step": 2120 + }, + { + "epoch": 4.832383124287343, + "grad_norm": 2.765625, + "learning_rate": 2.8473110350227536e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8817552924156189, + "num_tokens": 226857421.0, + "step": 2121 + }, + { + "epoch": 4.834663625997719, + "grad_norm": 4.125, + "learning_rate": 2.845445751629331e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8786935806274414, + "num_tokens": 226965168.0, + "step": 2122 + }, + { + "epoch": 4.836944127708096, + "grad_norm": 3.390625, + "learning_rate": 2.843580272167569e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8808483481407166, + "num_tokens": 227072404.0, + "step": 2123 + }, + { + "epoch": 4.839224629418472, + "grad_norm": 6.625, + "learning_rate": 2.8417145976962773e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8761008530855179, + "num_tokens": 227179076.0, + "step": 2124 + }, + { + "epoch": 4.841505131128848, + "grad_norm": 4.75, + "learning_rate": 2.8398487292743772e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8841915279626846, + "num_tokens": 227285768.0, + "step": 2125 + }, + { + "epoch": 4.843785632839225, + "grad_norm": 2.6875, + "learning_rate": 2.8379826679609e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8802362382411957, + "num_tokens": 227392431.0, + "step": 2126 + }, + { + "epoch": 4.846066134549601, + "grad_norm": 2.609375, + "learning_rate": 2.836116414814985e-06, + "loss": 0.6062, + "mean_token_accuracy": 0.8763725161552429, + "num_tokens": 227499046.0, + "step": 2127 + }, + { + "epoch": 4.848346636259977, + "grad_norm": 3.203125, + "learning_rate": 2.8342499708958827e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8813872933387756, + "num_tokens": 227606044.0, + "step": 2128 + }, + { + "epoch": 4.850627137970354, + "grad_norm": 3.546875, + "learning_rate": 2.8323833372629485e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8817006945610046, + "num_tokens": 227713533.0, + "step": 2129 + }, + { + "epoch": 4.85290763968073, + "grad_norm": 5.625, + "learning_rate": 2.8305165149756496e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8857055157423019, + "num_tokens": 227820658.0, + "step": 2130 + }, + { + "epoch": 4.855188141391106, + "grad_norm": 3.25, + "learning_rate": 2.828649505093558e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8798857480287552, + "num_tokens": 227927701.0, + "step": 2131 + }, + { + "epoch": 4.857468643101482, + "grad_norm": 4.53125, + "learning_rate": 2.826782308676351e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8805892914533615, + "num_tokens": 228034374.0, + "step": 2132 + }, + { + "epoch": 4.859749144811858, + "grad_norm": 2.75, + "learning_rate": 2.824914926783815e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8796881288290024, + "num_tokens": 228141770.0, + "step": 2133 + }, + { + "epoch": 4.862029646522235, + "grad_norm": 2.53125, + "learning_rate": 2.82304736047584e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8820720165967941, + "num_tokens": 228249025.0, + "step": 2134 + }, + { + "epoch": 4.864310148232612, + "grad_norm": 4.84375, + "learning_rate": 2.821179610812419e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8788727074861526, + "num_tokens": 228356604.0, + "step": 2135 + }, + { + "epoch": 4.866590649942988, + "grad_norm": 6.71875, + "learning_rate": 2.819311678853652e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8829772472381592, + "num_tokens": 228464360.0, + "step": 2136 + }, + { + "epoch": 4.868871151653364, + "grad_norm": 3.046875, + "learning_rate": 2.8174435656597403e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8821916729211807, + "num_tokens": 228571499.0, + "step": 2137 + }, + { + "epoch": 4.87115165336374, + "grad_norm": 4.78125, + "learning_rate": 2.8155752722909896e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8816975653171539, + "num_tokens": 228678610.0, + "step": 2138 + }, + { + "epoch": 4.873432155074116, + "grad_norm": 2.953125, + "learning_rate": 2.8137067998078073e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8804251700639725, + "num_tokens": 228785226.0, + "step": 2139 + }, + { + "epoch": 4.875712656784493, + "grad_norm": 2.421875, + "learning_rate": 2.8118381492707004e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8774864822626114, + "num_tokens": 228891949.0, + "step": 2140 + }, + { + "epoch": 4.877993158494869, + "grad_norm": 3.6875, + "learning_rate": 2.8099693217402807e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8809685558080673, + "num_tokens": 228999525.0, + "step": 2141 + }, + { + "epoch": 4.880273660205245, + "grad_norm": 3.859375, + "learning_rate": 2.808100318277258e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8823101967573166, + "num_tokens": 229106700.0, + "step": 2142 + }, + { + "epoch": 4.882554161915621, + "grad_norm": 2.359375, + "learning_rate": 2.806231139942443e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8847738802433014, + "num_tokens": 229214479.0, + "step": 2143 + }, + { + "epoch": 4.884834663625997, + "grad_norm": 2.8125, + "learning_rate": 2.8043617877967456e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8798518478870392, + "num_tokens": 229321261.0, + "step": 2144 + }, + { + "epoch": 4.887115165336374, + "grad_norm": 3.390625, + "learning_rate": 2.8024922629011727e-06, + "loss": 0.571, + "mean_token_accuracy": 0.883740559220314, + "num_tokens": 229428653.0, + "step": 2145 + }, + { + "epoch": 4.889395667046751, + "grad_norm": 3.046875, + "learning_rate": 2.800622566316831e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8811280727386475, + "num_tokens": 229535592.0, + "step": 2146 + }, + { + "epoch": 4.891676168757127, + "grad_norm": 4.03125, + "learning_rate": 2.798752699104925e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8848204910755157, + "num_tokens": 229642187.0, + "step": 2147 + }, + { + "epoch": 4.893956670467503, + "grad_norm": 2.8125, + "learning_rate": 2.7968826623267542e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8824999332427979, + "num_tokens": 229749302.0, + "step": 2148 + }, + { + "epoch": 4.896237172177879, + "grad_norm": 2.8125, + "learning_rate": 2.7950124570437163e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8834424167871475, + "num_tokens": 229856402.0, + "step": 2149 + }, + { + "epoch": 4.898517673888255, + "grad_norm": 3.59375, + "learning_rate": 2.793142084317303e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8832953125238419, + "num_tokens": 229963106.0, + "step": 2150 + }, + { + "epoch": 4.900798175598632, + "grad_norm": 2.578125, + "learning_rate": 2.7912715452091014e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8826733380556107, + "num_tokens": 230070472.0, + "step": 2151 + }, + { + "epoch": 4.903078677309008, + "grad_norm": 3.109375, + "learning_rate": 2.789400840780795e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8755721002817154, + "num_tokens": 230177193.0, + "step": 2152 + }, + { + "epoch": 4.905359179019384, + "grad_norm": 3.40625, + "learning_rate": 2.7875299720941577e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8802897483110428, + "num_tokens": 230284630.0, + "step": 2153 + }, + { + "epoch": 4.90763968072976, + "grad_norm": 3.296875, + "learning_rate": 2.785658940211059e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8808315396308899, + "num_tokens": 230391476.0, + "step": 2154 + }, + { + "epoch": 4.909920182440137, + "grad_norm": 4.4375, + "learning_rate": 2.7837877461934616e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.881686270236969, + "num_tokens": 230498008.0, + "step": 2155 + }, + { + "epoch": 4.9122006841505135, + "grad_norm": 2.578125, + "learning_rate": 2.7819163911034175e-06, + "loss": 0.571, + "mean_token_accuracy": 0.882976695895195, + "num_tokens": 230605141.0, + "step": 2156 + }, + { + "epoch": 4.91448118586089, + "grad_norm": 4.46875, + "learning_rate": 2.7800448760030724e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8807835131883621, + "num_tokens": 230712400.0, + "step": 2157 + }, + { + "epoch": 4.916761687571266, + "grad_norm": 7.28125, + "learning_rate": 2.7781732019546625e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8794967532157898, + "num_tokens": 230819368.0, + "step": 2158 + }, + { + "epoch": 4.919042189281642, + "grad_norm": 2.71875, + "learning_rate": 2.776301370020513e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8819598108530045, + "num_tokens": 230925639.0, + "step": 2159 + }, + { + "epoch": 4.921322690992018, + "grad_norm": 2.765625, + "learning_rate": 2.7744293812630412e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.881621241569519, + "num_tokens": 231032712.0, + "step": 2160 + }, + { + "epoch": 4.923603192702394, + "grad_norm": 3.859375, + "learning_rate": 2.77255723674475e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8840135484933853, + "num_tokens": 231139944.0, + "step": 2161 + }, + { + "epoch": 4.925883694412771, + "grad_norm": 3.28125, + "learning_rate": 2.770684937528233e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8798476308584213, + "num_tokens": 231246948.0, + "step": 2162 + }, + { + "epoch": 4.928164196123147, + "grad_norm": 2.921875, + "learning_rate": 2.7688124846761716e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8791171163320541, + "num_tokens": 231353810.0, + "step": 2163 + }, + { + "epoch": 4.930444697833523, + "grad_norm": 3.96875, + "learning_rate": 2.766939879251333e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.876966580748558, + "num_tokens": 231461394.0, + "step": 2164 + }, + { + "epoch": 4.932725199543899, + "grad_norm": 2.765625, + "learning_rate": 2.7650671223165726e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8889574855566025, + "num_tokens": 231568225.0, + "step": 2165 + }, + { + "epoch": 4.935005701254276, + "grad_norm": 3.0, + "learning_rate": 2.7631942149348313e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8834359645843506, + "num_tokens": 231675236.0, + "step": 2166 + }, + { + "epoch": 4.9372862029646525, + "grad_norm": 4.78125, + "learning_rate": 2.761321158169134e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8793386965990067, + "num_tokens": 231782454.0, + "step": 2167 + }, + { + "epoch": 4.939566704675029, + "grad_norm": 4.78125, + "learning_rate": 2.759447953082593e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8823465257883072, + "num_tokens": 231889063.0, + "step": 2168 + }, + { + "epoch": 4.941847206385405, + "grad_norm": 3.015625, + "learning_rate": 2.757574600738402e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8793519288301468, + "num_tokens": 231995594.0, + "step": 2169 + }, + { + "epoch": 4.944127708095781, + "grad_norm": 4.78125, + "learning_rate": 2.755701102199841e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8815154582262039, + "num_tokens": 232102587.0, + "step": 2170 + }, + { + "epoch": 4.946408209806157, + "grad_norm": 3.28125, + "learning_rate": 2.7538274585302707e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8854823410511017, + "num_tokens": 232209760.0, + "step": 2171 + }, + { + "epoch": 4.9486887115165334, + "grad_norm": 3.34375, + "learning_rate": 2.751953670793135e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8862673789262772, + "num_tokens": 232316636.0, + "step": 2172 + }, + { + "epoch": 4.95096921322691, + "grad_norm": 3.1875, + "learning_rate": 2.7500797400519595e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8799717128276825, + "num_tokens": 232423639.0, + "step": 2173 + }, + { + "epoch": 4.953249714937286, + "grad_norm": 2.90625, + "learning_rate": 2.7482056673703526e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8792567402124405, + "num_tokens": 232530768.0, + "step": 2174 + }, + { + "epoch": 4.955530216647663, + "grad_norm": 4.53125, + "learning_rate": 2.746331453812e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8782864660024643, + "num_tokens": 232637783.0, + "step": 2175 + }, + { + "epoch": 4.957810718358039, + "grad_norm": 3.125, + "learning_rate": 2.74445710044067e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.8764726370573044, + "num_tokens": 232745202.0, + "step": 2176 + }, + { + "epoch": 4.960091220068415, + "grad_norm": 3.09375, + "learning_rate": 2.7425826083202096e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8824016451835632, + "num_tokens": 232852068.0, + "step": 2177 + }, + { + "epoch": 4.9623717217787915, + "grad_norm": 2.84375, + "learning_rate": 2.740707978514543e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8894577920436859, + "num_tokens": 232959452.0, + "step": 2178 + }, + { + "epoch": 4.964652223489168, + "grad_norm": 4.34375, + "learning_rate": 2.738833212087676e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8816526979207993, + "num_tokens": 233066740.0, + "step": 2179 + }, + { + "epoch": 4.966932725199544, + "grad_norm": 2.703125, + "learning_rate": 2.736958310103688e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8829785734415054, + "num_tokens": 233173753.0, + "step": 2180 + }, + { + "epoch": 4.96921322690992, + "grad_norm": 2.421875, + "learning_rate": 2.735083273626738e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8816793113946915, + "num_tokens": 233280384.0, + "step": 2181 + }, + { + "epoch": 4.971493728620296, + "grad_norm": 5.0, + "learning_rate": 2.7332081037210607e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8826627433300018, + "num_tokens": 233388025.0, + "step": 2182 + }, + { + "epoch": 4.9737742303306725, + "grad_norm": 3.875, + "learning_rate": 2.7313328014509653e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8806509524583817, + "num_tokens": 233494917.0, + "step": 2183 + }, + { + "epoch": 4.976054732041049, + "grad_norm": 3.65625, + "learning_rate": 2.729457367880838e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.886359453201294, + "num_tokens": 233602067.0, + "step": 2184 + }, + { + "epoch": 4.978335233751425, + "grad_norm": 2.8125, + "learning_rate": 2.727581804075139e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8818333595991135, + "num_tokens": 233708533.0, + "step": 2185 + }, + { + "epoch": 4.980615735461802, + "grad_norm": 2.578125, + "learning_rate": 2.7257061110984005e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8846048712730408, + "num_tokens": 233815478.0, + "step": 2186 + }, + { + "epoch": 4.982896237172178, + "grad_norm": 2.53125, + "learning_rate": 2.7238302900152327e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8843528181314468, + "num_tokens": 233922328.0, + "step": 2187 + }, + { + "epoch": 4.985176738882554, + "grad_norm": 5.40625, + "learning_rate": 2.7219543418903115e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.8763009756803513, + "num_tokens": 234028772.0, + "step": 2188 + }, + { + "epoch": 4.9874572405929305, + "grad_norm": 2.4375, + "learning_rate": 2.720078267788392e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8788257986307144, + "num_tokens": 234135491.0, + "step": 2189 + }, + { + "epoch": 4.989737742303307, + "grad_norm": 3.921875, + "learning_rate": 2.718202068774296e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8749925345182419, + "num_tokens": 234243399.0, + "step": 2190 + }, + { + "epoch": 4.992018244013683, + "grad_norm": 2.8125, + "learning_rate": 2.7163257459129184e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8842321634292603, + "num_tokens": 234350519.0, + "step": 2191 + }, + { + "epoch": 4.994298745724059, + "grad_norm": 3.484375, + "learning_rate": 2.7144493002692242e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8810799866914749, + "num_tokens": 234457691.0, + "step": 2192 + }, + { + "epoch": 4.996579247434435, + "grad_norm": 3.546875, + "learning_rate": 2.7125727329082474e-06, + "loss": 0.568, + "mean_token_accuracy": 0.8835117220878601, + "num_tokens": 234564624.0, + "step": 2193 + }, + { + "epoch": 4.9988597491448115, + "grad_norm": 4.625, + "learning_rate": 2.7106960448950904e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8846527189016342, + "num_tokens": 234671664.0, + "step": 2194 + }, + { + "epoch": 5.0, + "grad_norm": 3.8125, + "learning_rate": 2.7088192372949267e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.893031507730484, + "num_tokens": 234711160.0, + "step": 2195 + }, + { + "epoch": 5.002280501710376, + "grad_norm": 2.40625, + "learning_rate": 2.7069423111729948e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8820315301418304, + "num_tokens": 234818290.0, + "step": 2196 + }, + { + "epoch": 5.004561003420752, + "grad_norm": 2.625, + "learning_rate": 2.705065267594602e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8801742941141129, + "num_tokens": 234925842.0, + "step": 2197 + }, + { + "epoch": 5.006841505131129, + "grad_norm": 3.171875, + "learning_rate": 2.703188107625123e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8789117336273193, + "num_tokens": 235032605.0, + "step": 2198 + }, + { + "epoch": 5.009122006841505, + "grad_norm": 2.984375, + "learning_rate": 2.701310832329996e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8840703815221786, + "num_tokens": 235139505.0, + "step": 2199 + }, + { + "epoch": 5.011402508551882, + "grad_norm": 2.84375, + "learning_rate": 2.6994334427747276e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8826315701007843, + "num_tokens": 235246340.0, + "step": 2200 + }, + { + "epoch": 5.011402508551882, + "eval_loss": 0.5884966850280762, + "eval_mean_token_accuracy": 0.8795779687370184, + "eval_num_tokens": 235246340.0, + "eval_runtime": 58.673, + "eval_samples_per_second": 142.911, + "eval_steps_per_second": 4.482, + "step": 2200 + }, + { + "epoch": 5.013683010262258, + "grad_norm": 3.234375, + "learning_rate": 2.6975559400248876e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8832818120718002, + "num_tokens": 235354018.0, + "step": 2201 + }, + { + "epoch": 5.015963511972634, + "grad_norm": 3.625, + "learning_rate": 2.6956783251461093e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8836443275213242, + "num_tokens": 235461263.0, + "step": 2202 + }, + { + "epoch": 5.01824401368301, + "grad_norm": 5.09375, + "learning_rate": 2.6938005992040923e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.88588847219944, + "num_tokens": 235568391.0, + "step": 2203 + }, + { + "epoch": 5.020524515393387, + "grad_norm": 2.703125, + "learning_rate": 2.6919227632645963e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8812144100666046, + "num_tokens": 235675302.0, + "step": 2204 + }, + { + "epoch": 5.022805017103763, + "grad_norm": 5.46875, + "learning_rate": 2.690044818393444e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8809924572706223, + "num_tokens": 235781916.0, + "step": 2205 + }, + { + "epoch": 5.025085518814139, + "grad_norm": 2.65625, + "learning_rate": 2.688166765656523e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8800527006387711, + "num_tokens": 235889095.0, + "step": 2206 + }, + { + "epoch": 5.027366020524515, + "grad_norm": 2.65625, + "learning_rate": 2.686288606119778e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8826051503419876, + "num_tokens": 235995817.0, + "step": 2207 + }, + { + "epoch": 5.029646522234891, + "grad_norm": 2.828125, + "learning_rate": 2.6844103408492165e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8821920156478882, + "num_tokens": 236102620.0, + "step": 2208 + }, + { + "epoch": 5.031927023945268, + "grad_norm": 4.28125, + "learning_rate": 2.682531970910906e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.882752850651741, + "num_tokens": 236210097.0, + "step": 2209 + }, + { + "epoch": 5.034207525655645, + "grad_norm": 3.609375, + "learning_rate": 2.6806534973709723e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8835909813642502, + "num_tokens": 236317553.0, + "step": 2210 + }, + { + "epoch": 5.036488027366021, + "grad_norm": 2.9375, + "learning_rate": 2.6787749212956023e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8868236839771271, + "num_tokens": 236424162.0, + "step": 2211 + }, + { + "epoch": 5.038768529076397, + "grad_norm": 3.90625, + "learning_rate": 2.676896243751037e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8755950033664703, + "num_tokens": 236531359.0, + "step": 2212 + }, + { + "epoch": 5.041049030786773, + "grad_norm": 2.1875, + "learning_rate": 2.6750174658035793e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8848745375871658, + "num_tokens": 236638705.0, + "step": 2213 + }, + { + "epoch": 5.043329532497149, + "grad_norm": 4.5, + "learning_rate": 2.673138588519587e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8795148730278015, + "num_tokens": 236745788.0, + "step": 2214 + }, + { + "epoch": 5.045610034207526, + "grad_norm": 3.984375, + "learning_rate": 2.671259612965475e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8808691650629044, + "num_tokens": 236852575.0, + "step": 2215 + }, + { + "epoch": 5.047890535917902, + "grad_norm": 2.9375, + "learning_rate": 2.6693805402077123e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8838084042072296, + "num_tokens": 236959458.0, + "step": 2216 + }, + { + "epoch": 5.050171037628278, + "grad_norm": 2.953125, + "learning_rate": 2.6675013713128252e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8833956867456436, + "num_tokens": 237067046.0, + "step": 2217 + }, + { + "epoch": 5.052451539338654, + "grad_norm": 3.0, + "learning_rate": 2.665622107347393e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8833256661891937, + "num_tokens": 237174163.0, + "step": 2218 + }, + { + "epoch": 5.05473204104903, + "grad_norm": 3.046875, + "learning_rate": 2.6637427493780503e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.8838570863008499, + "num_tokens": 237280734.0, + "step": 2219 + }, + { + "epoch": 5.0570125427594075, + "grad_norm": 5.21875, + "learning_rate": 2.6618632984714843e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8816571533679962, + "num_tokens": 237388128.0, + "step": 2220 + }, + { + "epoch": 5.059293044469784, + "grad_norm": 2.671875, + "learning_rate": 2.6599837556944353e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.8754912316799164, + "num_tokens": 237494752.0, + "step": 2221 + }, + { + "epoch": 5.06157354618016, + "grad_norm": 4.53125, + "learning_rate": 2.658104122113695e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8795156031847, + "num_tokens": 237601646.0, + "step": 2222 + }, + { + "epoch": 5.063854047890536, + "grad_norm": 2.6875, + "learning_rate": 2.6562243987961066e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.889316976070404, + "num_tokens": 237709147.0, + "step": 2223 + }, + { + "epoch": 5.066134549600912, + "grad_norm": 3.90625, + "learning_rate": 2.6543445868085665e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8827229887247086, + "num_tokens": 237816406.0, + "step": 2224 + }, + { + "epoch": 5.068415051311288, + "grad_norm": 2.796875, + "learning_rate": 2.652464687218018e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.8769262731075287, + "num_tokens": 237923355.0, + "step": 2225 + }, + { + "epoch": 5.070695553021665, + "grad_norm": 3.921875, + "learning_rate": 2.6505847010914575e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8756005167961121, + "num_tokens": 238030151.0, + "step": 2226 + }, + { + "epoch": 5.072976054732041, + "grad_norm": 4.34375, + "learning_rate": 2.6487046294959275e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8792047202587128, + "num_tokens": 238137145.0, + "step": 2227 + }, + { + "epoch": 5.075256556442417, + "grad_norm": 3.09375, + "learning_rate": 2.64682447349852e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8841764777898788, + "num_tokens": 238243562.0, + "step": 2228 + }, + { + "epoch": 5.077537058152793, + "grad_norm": 3.375, + "learning_rate": 2.6449442341663755e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8807887881994247, + "num_tokens": 238350893.0, + "step": 2229 + }, + { + "epoch": 5.07981755986317, + "grad_norm": 3.5, + "learning_rate": 2.643063912566683e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8790174126625061, + "num_tokens": 238457625.0, + "step": 2230 + }, + { + "epoch": 5.0820980615735465, + "grad_norm": 3.390625, + "learning_rate": 2.641183509766675e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8864348530769348, + "num_tokens": 238564821.0, + "step": 2231 + }, + { + "epoch": 5.084378563283923, + "grad_norm": 3.859375, + "learning_rate": 2.639303026833632e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8856208771467209, + "num_tokens": 238671917.0, + "step": 2232 + }, + { + "epoch": 5.086659064994299, + "grad_norm": 5.46875, + "learning_rate": 2.6374224648348815e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8855437189340591, + "num_tokens": 238779139.0, + "step": 2233 + }, + { + "epoch": 5.088939566704675, + "grad_norm": 4.53125, + "learning_rate": 2.6355418248377928e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8826903700828552, + "num_tokens": 238886730.0, + "step": 2234 + }, + { + "epoch": 5.091220068415051, + "grad_norm": 2.640625, + "learning_rate": 2.633661107909781e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8886370956897736, + "num_tokens": 238993760.0, + "step": 2235 + }, + { + "epoch": 5.0935005701254275, + "grad_norm": 2.515625, + "learning_rate": 2.6317803151183053e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8850623667240143, + "num_tokens": 239100806.0, + "step": 2236 + }, + { + "epoch": 5.095781071835804, + "grad_norm": 2.625, + "learning_rate": 2.629899447530866e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8851571381092072, + "num_tokens": 239209162.0, + "step": 2237 + }, + { + "epoch": 5.09806157354618, + "grad_norm": 4.15625, + "learning_rate": 2.6280185062150084e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8866082727909088, + "num_tokens": 239316377.0, + "step": 2238 + }, + { + "epoch": 5.100342075256556, + "grad_norm": 3.84375, + "learning_rate": 2.6261374922383176e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8804745078086853, + "num_tokens": 239423635.0, + "step": 2239 + }, + { + "epoch": 5.102622576966933, + "grad_norm": 3.921875, + "learning_rate": 2.6242564066684217e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.8799638897180557, + "num_tokens": 239530264.0, + "step": 2240 + }, + { + "epoch": 5.104903078677309, + "grad_norm": 4.0625, + "learning_rate": 2.6223752505729884e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8811380565166473, + "num_tokens": 239637865.0, + "step": 2241 + }, + { + "epoch": 5.1071835803876855, + "grad_norm": 3.15625, + "learning_rate": 2.6204940250197253e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8820418566465378, + "num_tokens": 239744908.0, + "step": 2242 + }, + { + "epoch": 5.109464082098062, + "grad_norm": 2.84375, + "learning_rate": 2.61861273107638e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8828686624765396, + "num_tokens": 239852149.0, + "step": 2243 + }, + { + "epoch": 5.111744583808438, + "grad_norm": 6.84375, + "learning_rate": 2.6167313698107385e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.880137637257576, + "num_tokens": 239959160.0, + "step": 2244 + }, + { + "epoch": 5.114025085518814, + "grad_norm": 4.15625, + "learning_rate": 2.6148499422906243e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8834390491247177, + "num_tokens": 240066499.0, + "step": 2245 + }, + { + "epoch": 5.11630558722919, + "grad_norm": 4.4375, + "learning_rate": 2.6129684495839013e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8807038068771362, + "num_tokens": 240173494.0, + "step": 2246 + }, + { + "epoch": 5.1185860889395665, + "grad_norm": 4.5, + "learning_rate": 2.611086892758467e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.883940801024437, + "num_tokens": 240280234.0, + "step": 2247 + }, + { + "epoch": 5.120866590649943, + "grad_norm": 3.171875, + "learning_rate": 2.6092052728822564e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8823877573013306, + "num_tokens": 240387350.0, + "step": 2248 + }, + { + "epoch": 5.123147092360319, + "grad_norm": 3.53125, + "learning_rate": 2.607323591023242e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8841145634651184, + "num_tokens": 240494417.0, + "step": 2249 + }, + { + "epoch": 5.125427594070696, + "grad_norm": 2.734375, + "learning_rate": 2.605441848249428e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8849078863859177, + "num_tokens": 240601696.0, + "step": 2250 + }, + { + "epoch": 5.127708095781072, + "grad_norm": 6.34375, + "learning_rate": 2.6035600456288573e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8804211169481277, + "num_tokens": 240708216.0, + "step": 2251 + }, + { + "epoch": 5.129988597491448, + "grad_norm": 4.34375, + "learning_rate": 2.6016781842296044e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.881677657365799, + "num_tokens": 240814705.0, + "step": 2252 + }, + { + "epoch": 5.1322690992018245, + "grad_norm": 3.296875, + "learning_rate": 2.599796265119777e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.881735309958458, + "num_tokens": 240921724.0, + "step": 2253 + }, + { + "epoch": 5.134549600912201, + "grad_norm": 4.46875, + "learning_rate": 2.597914289367516e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8814171552658081, + "num_tokens": 241029263.0, + "step": 2254 + }, + { + "epoch": 5.136830102622577, + "grad_norm": 4.28125, + "learning_rate": 2.596032258040994e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8828539550304413, + "num_tokens": 241136198.0, + "step": 2255 + }, + { + "epoch": 5.139110604332953, + "grad_norm": 3.375, + "learning_rate": 2.594150172208417e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.872844010591507, + "num_tokens": 241243108.0, + "step": 2256 + }, + { + "epoch": 5.141391106043329, + "grad_norm": 3.921875, + "learning_rate": 2.59226803293802e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.881134495139122, + "num_tokens": 241350038.0, + "step": 2257 + }, + { + "epoch": 5.1436716077537055, + "grad_norm": 3.484375, + "learning_rate": 2.5903858412980688e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.882486030459404, + "num_tokens": 241456705.0, + "step": 2258 + }, + { + "epoch": 5.145952109464082, + "grad_norm": 3.21875, + "learning_rate": 2.5885035983568584e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8809113800525665, + "num_tokens": 241564205.0, + "step": 2259 + }, + { + "epoch": 5.148232611174459, + "grad_norm": 8.3125, + "learning_rate": 2.5866213051827148e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8836153000593185, + "num_tokens": 241671302.0, + "step": 2260 + }, + { + "epoch": 5.150513112884835, + "grad_norm": 4.96875, + "learning_rate": 2.5847389628439905e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8828669935464859, + "num_tokens": 241778608.0, + "step": 2261 + }, + { + "epoch": 5.152793614595211, + "grad_norm": 6.4375, + "learning_rate": 2.5828565724090672e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.881557047367096, + "num_tokens": 241886293.0, + "step": 2262 + }, + { + "epoch": 5.155074116305587, + "grad_norm": 4.25, + "learning_rate": 2.5809741349463526e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.882814958691597, + "num_tokens": 241993190.0, + "step": 2263 + }, + { + "epoch": 5.1573546180159635, + "grad_norm": 3.59375, + "learning_rate": 2.579091651524282e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8815194815397263, + "num_tokens": 242100380.0, + "step": 2264 + }, + { + "epoch": 5.15963511972634, + "grad_norm": 3.484375, + "learning_rate": 2.5772091232113176e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8794299364089966, + "num_tokens": 242207689.0, + "step": 2265 + }, + { + "epoch": 5.161915621436716, + "grad_norm": 4.28125, + "learning_rate": 2.575326551075945e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8856062591075897, + "num_tokens": 242314652.0, + "step": 2266 + }, + { + "epoch": 5.164196123147092, + "grad_norm": 3.203125, + "learning_rate": 2.5734439361866762e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8838386088609695, + "num_tokens": 242422494.0, + "step": 2267 + }, + { + "epoch": 5.166476624857468, + "grad_norm": 5.875, + "learning_rate": 2.571561279612047e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8841647505760193, + "num_tokens": 242529583.0, + "step": 2268 + }, + { + "epoch": 5.168757126567845, + "grad_norm": 5.1875, + "learning_rate": 2.5696785824206177e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8775971680879593, + "num_tokens": 242636422.0, + "step": 2269 + }, + { + "epoch": 5.1710376282782216, + "grad_norm": 5.15625, + "learning_rate": 2.5677958456809703e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8794868886470795, + "num_tokens": 242742862.0, + "step": 2270 + }, + { + "epoch": 5.173318129988598, + "grad_norm": 3.59375, + "learning_rate": 2.5659130704617092e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8852169960737228, + "num_tokens": 242850283.0, + "step": 2271 + }, + { + "epoch": 5.175598631698974, + "grad_norm": 2.703125, + "learning_rate": 2.5640302578314614e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8777539879083633, + "num_tokens": 242957389.0, + "step": 2272 + }, + { + "epoch": 5.17787913340935, + "grad_norm": 2.546875, + "learning_rate": 2.562147408858876e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8816378861665726, + "num_tokens": 243064479.0, + "step": 2273 + }, + { + "epoch": 5.180159635119726, + "grad_norm": 2.796875, + "learning_rate": 2.5602645246126207e-06, + "loss": 0.5922, + "mean_token_accuracy": 0.877925843000412, + "num_tokens": 243171634.0, + "step": 2274 + }, + { + "epoch": 5.1824401368301025, + "grad_norm": 3.125, + "learning_rate": 2.5583816061613847e-06, + "loss": 0.566, + "mean_token_accuracy": 0.885333925485611, + "num_tokens": 243278723.0, + "step": 2275 + }, + { + "epoch": 5.184720638540479, + "grad_norm": 2.296875, + "learning_rate": 2.5564986545738767e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8853138238191605, + "num_tokens": 243386191.0, + "step": 2276 + }, + { + "epoch": 5.187001140250855, + "grad_norm": 2.6875, + "learning_rate": 2.554615670918823e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8801163583993912, + "num_tokens": 243492478.0, + "step": 2277 + }, + { + "epoch": 5.189281641961231, + "grad_norm": 3.0, + "learning_rate": 2.552732656264969e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8835473358631134, + "num_tokens": 243599878.0, + "step": 2278 + }, + { + "epoch": 5.191562143671608, + "grad_norm": 3.84375, + "learning_rate": 2.5508496116810766e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.8747565001249313, + "num_tokens": 243706837.0, + "step": 2279 + }, + { + "epoch": 5.193842645381984, + "grad_norm": 3.1875, + "learning_rate": 2.548966538235927e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8776230216026306, + "num_tokens": 243813746.0, + "step": 2280 + }, + { + "epoch": 5.196123147092361, + "grad_norm": 4.34375, + "learning_rate": 2.547083436998316e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8837112933397293, + "num_tokens": 243920433.0, + "step": 2281 + }, + { + "epoch": 5.198403648802737, + "grad_norm": 3.234375, + "learning_rate": 2.5452003090370543e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8835209310054779, + "num_tokens": 244027394.0, + "step": 2282 + }, + { + "epoch": 5.200684150513113, + "grad_norm": 3.4375, + "learning_rate": 2.5433171554209694e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8822397440671921, + "num_tokens": 244134310.0, + "step": 2283 + }, + { + "epoch": 5.202964652223489, + "grad_norm": 3.1875, + "learning_rate": 2.5414339772189045e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8861335813999176, + "num_tokens": 244241565.0, + "step": 2284 + }, + { + "epoch": 5.205245153933865, + "grad_norm": 3.90625, + "learning_rate": 2.5395507754997135e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8825538158416748, + "num_tokens": 244349038.0, + "step": 2285 + }, + { + "epoch": 5.2075256556442415, + "grad_norm": 2.375, + "learning_rate": 2.5376675513322665e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8824332058429718, + "num_tokens": 244455939.0, + "step": 2286 + }, + { + "epoch": 5.209806157354618, + "grad_norm": 3.265625, + "learning_rate": 2.535784305785443e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8791725635528564, + "num_tokens": 244563202.0, + "step": 2287 + }, + { + "epoch": 5.212086659064994, + "grad_norm": 2.71875, + "learning_rate": 2.5339010399281394e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8839509189128876, + "num_tokens": 244670656.0, + "step": 2288 + }, + { + "epoch": 5.214367160775371, + "grad_norm": 2.859375, + "learning_rate": 2.53201775482926e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8803385496139526, + "num_tokens": 244777602.0, + "step": 2289 + }, + { + "epoch": 5.216647662485747, + "grad_norm": 3.28125, + "learning_rate": 2.530134451557722e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.8833064287900925, + "num_tokens": 244884757.0, + "step": 2290 + }, + { + "epoch": 5.218928164196123, + "grad_norm": 2.859375, + "learning_rate": 2.52825113118245e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8829501569271088, + "num_tokens": 244992076.0, + "step": 2291 + }, + { + "epoch": 5.2212086659065, + "grad_norm": 4.15625, + "learning_rate": 2.5263677947723813e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8804136514663696, + "num_tokens": 245099111.0, + "step": 2292 + }, + { + "epoch": 5.223489167616876, + "grad_norm": 3.484375, + "learning_rate": 2.5244844433964615e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.8848851323127747, + "num_tokens": 245206635.0, + "step": 2293 + }, + { + "epoch": 5.225769669327252, + "grad_norm": 2.484375, + "learning_rate": 2.522601078123645e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8856381624937057, + "num_tokens": 245313995.0, + "step": 2294 + }, + { + "epoch": 5.228050171037628, + "grad_norm": 2.84375, + "learning_rate": 2.5207177000228916e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8863477110862732, + "num_tokens": 245420590.0, + "step": 2295 + }, + { + "epoch": 5.230330672748004, + "grad_norm": 2.90625, + "learning_rate": 2.5188343101631717e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8750515878200531, + "num_tokens": 245527620.0, + "step": 2296 + }, + { + "epoch": 5.2326111744583805, + "grad_norm": 4.84375, + "learning_rate": 2.516950909613462e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8836745172739029, + "num_tokens": 245634856.0, + "step": 2297 + }, + { + "epoch": 5.234891676168757, + "grad_norm": 2.65625, + "learning_rate": 2.5150674994427427e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8803335726261139, + "num_tokens": 245741637.0, + "step": 2298 + }, + { + "epoch": 5.237172177879134, + "grad_norm": 6.40625, + "learning_rate": 2.5131840807200015e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8841415345668793, + "num_tokens": 245850208.0, + "step": 2299 + }, + { + "epoch": 5.23945267958951, + "grad_norm": 2.796875, + "learning_rate": 2.511300654514231e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8761951625347137, + "num_tokens": 245956827.0, + "step": 2300 + }, + { + "epoch": 5.241733181299886, + "grad_norm": 2.578125, + "learning_rate": 2.5094172218944276e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8835689127445221, + "num_tokens": 246064233.0, + "step": 2301 + }, + { + "epoch": 5.244013683010262, + "grad_norm": 2.5625, + "learning_rate": 2.5075337839295903e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.881672739982605, + "num_tokens": 246171103.0, + "step": 2302 + }, + { + "epoch": 5.246294184720639, + "grad_norm": 6.4375, + "learning_rate": 2.5056503416887222e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8831684589385986, + "num_tokens": 246278163.0, + "step": 2303 + }, + { + "epoch": 5.248574686431015, + "grad_norm": 3.375, + "learning_rate": 2.5037668962408295e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8839241862297058, + "num_tokens": 246385460.0, + "step": 2304 + }, + { + "epoch": 5.250855188141391, + "grad_norm": 2.828125, + "learning_rate": 2.5018834486549198e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8784928619861603, + "num_tokens": 246492696.0, + "step": 2305 + }, + { + "epoch": 5.253135689851767, + "grad_norm": 3.1875, + "learning_rate": 2.5e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.884750634431839, + "num_tokens": 246599773.0, + "step": 2306 + }, + { + "epoch": 5.255416191562143, + "grad_norm": 3.6875, + "learning_rate": 2.4981165513450807e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8797119110822678, + "num_tokens": 246706234.0, + "step": 2307 + }, + { + "epoch": 5.2576966932725195, + "grad_norm": 4.09375, + "learning_rate": 2.4962331037591705e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8851300626993179, + "num_tokens": 246813408.0, + "step": 2308 + }, + { + "epoch": 5.259977194982897, + "grad_norm": 4.53125, + "learning_rate": 2.494349658311279e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8815591186285019, + "num_tokens": 246920031.0, + "step": 2309 + }, + { + "epoch": 5.262257696693273, + "grad_norm": 5.46875, + "learning_rate": 2.492466216070411e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8796799033880234, + "num_tokens": 247027919.0, + "step": 2310 + }, + { + "epoch": 5.264538198403649, + "grad_norm": 3.390625, + "learning_rate": 2.4905827781055733e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8780756443738937, + "num_tokens": 247134526.0, + "step": 2311 + }, + { + "epoch": 5.266818700114025, + "grad_norm": 2.765625, + "learning_rate": 2.4886993454857696e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8812786191701889, + "num_tokens": 247242308.0, + "step": 2312 + }, + { + "epoch": 5.269099201824401, + "grad_norm": 2.875, + "learning_rate": 2.486815919279999e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8846468031406403, + "num_tokens": 247349006.0, + "step": 2313 + }, + { + "epoch": 5.271379703534778, + "grad_norm": 4.03125, + "learning_rate": 2.4849325005572573e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8841832727193832, + "num_tokens": 247455756.0, + "step": 2314 + }, + { + "epoch": 5.273660205245154, + "grad_norm": 3.234375, + "learning_rate": 2.483049090386539e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8810638189315796, + "num_tokens": 247562448.0, + "step": 2315 + }, + { + "epoch": 5.27594070695553, + "grad_norm": 5.25, + "learning_rate": 2.4811656898368287e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.87621209025383, + "num_tokens": 247669493.0, + "step": 2316 + }, + { + "epoch": 5.278221208665906, + "grad_norm": 2.640625, + "learning_rate": 2.4792822999771092e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8810609132051468, + "num_tokens": 247776380.0, + "step": 2317 + }, + { + "epoch": 5.280501710376283, + "grad_norm": 2.9375, + "learning_rate": 2.477398921876356e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8778067380189896, + "num_tokens": 247883246.0, + "step": 2318 + }, + { + "epoch": 5.282782212086659, + "grad_norm": 4.5, + "learning_rate": 2.475515556603539e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.882491260766983, + "num_tokens": 247990244.0, + "step": 2319 + }, + { + "epoch": 5.285062713797036, + "grad_norm": 5.875, + "learning_rate": 2.47363220522762e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8826608210802078, + "num_tokens": 248097490.0, + "step": 2320 + }, + { + "epoch": 5.287343215507412, + "grad_norm": 4.40625, + "learning_rate": 2.4717488688175513e-06, + "loss": 0.576, + "mean_token_accuracy": 0.8805565237998962, + "num_tokens": 248204076.0, + "step": 2321 + }, + { + "epoch": 5.289623717217788, + "grad_norm": 2.75, + "learning_rate": 2.469865548442279e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8818432092666626, + "num_tokens": 248311212.0, + "step": 2322 + }, + { + "epoch": 5.291904218928164, + "grad_norm": 3.453125, + "learning_rate": 2.4679822451707404e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8799884766340256, + "num_tokens": 248418165.0, + "step": 2323 + }, + { + "epoch": 5.29418472063854, + "grad_norm": 3.015625, + "learning_rate": 2.4660989600718606e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8832896500825882, + "num_tokens": 248525308.0, + "step": 2324 + }, + { + "epoch": 5.296465222348917, + "grad_norm": 3.484375, + "learning_rate": 2.4642156942145577e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8862069100141525, + "num_tokens": 248632300.0, + "step": 2325 + }, + { + "epoch": 5.298745724059293, + "grad_norm": 3.28125, + "learning_rate": 2.4623324486677352e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8789513111114502, + "num_tokens": 248739272.0, + "step": 2326 + }, + { + "epoch": 5.301026225769669, + "grad_norm": 2.734375, + "learning_rate": 2.4604492245002873e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.881064772605896, + "num_tokens": 248846325.0, + "step": 2327 + }, + { + "epoch": 5.303306727480045, + "grad_norm": 2.453125, + "learning_rate": 2.4585660227810963e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8820571899414062, + "num_tokens": 248953637.0, + "step": 2328 + }, + { + "epoch": 5.305587229190422, + "grad_norm": 3.21875, + "learning_rate": 2.4566828445790306e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8833505213260651, + "num_tokens": 249060027.0, + "step": 2329 + }, + { + "epoch": 5.307867730900798, + "grad_norm": 2.984375, + "learning_rate": 2.454799690962946e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8810784071683884, + "num_tokens": 249166827.0, + "step": 2330 + }, + { + "epoch": 5.310148232611175, + "grad_norm": 2.96875, + "learning_rate": 2.4529165630016855e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8803735673427582, + "num_tokens": 249273474.0, + "step": 2331 + }, + { + "epoch": 5.312428734321551, + "grad_norm": 3.5625, + "learning_rate": 2.4510334617640733e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8823344260454178, + "num_tokens": 249380314.0, + "step": 2332 + }, + { + "epoch": 5.314709236031927, + "grad_norm": 3.046875, + "learning_rate": 2.4491503883189242e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8847284018993378, + "num_tokens": 249487519.0, + "step": 2333 + }, + { + "epoch": 5.316989737742303, + "grad_norm": 4.625, + "learning_rate": 2.447267343735032e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8809306025505066, + "num_tokens": 249594226.0, + "step": 2334 + }, + { + "epoch": 5.319270239452679, + "grad_norm": 2.890625, + "learning_rate": 2.4453843290811772e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8837304264307022, + "num_tokens": 249701458.0, + "step": 2335 + }, + { + "epoch": 5.321550741163056, + "grad_norm": 3.8125, + "learning_rate": 2.4435013454261246e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8787836730480194, + "num_tokens": 249807967.0, + "step": 2336 + }, + { + "epoch": 5.323831242873432, + "grad_norm": 3.625, + "learning_rate": 2.4416183938386157e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8804430663585663, + "num_tokens": 249915439.0, + "step": 2337 + }, + { + "epoch": 5.326111744583809, + "grad_norm": 2.875, + "learning_rate": 2.4397354753873797e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8802332282066345, + "num_tokens": 250022617.0, + "step": 2338 + }, + { + "epoch": 5.328392246294185, + "grad_norm": 4.28125, + "learning_rate": 2.4378525911411246e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8810295462608337, + "num_tokens": 250129347.0, + "step": 2339 + }, + { + "epoch": 5.330672748004561, + "grad_norm": 4.34375, + "learning_rate": 2.435969742168539e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.8794845640659332, + "num_tokens": 250236273.0, + "step": 2340 + }, + { + "epoch": 5.3329532497149374, + "grad_norm": 3.75, + "learning_rate": 2.4340869295382924e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8810597956180573, + "num_tokens": 250343235.0, + "step": 2341 + }, + { + "epoch": 5.335233751425314, + "grad_norm": 5.25, + "learning_rate": 2.432204154319031e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8840866684913635, + "num_tokens": 250450686.0, + "step": 2342 + }, + { + "epoch": 5.33751425313569, + "grad_norm": 3.0, + "learning_rate": 2.4303214175793827e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8780745565891266, + "num_tokens": 250557785.0, + "step": 2343 + }, + { + "epoch": 5.339794754846066, + "grad_norm": 5.71875, + "learning_rate": 2.4284387203879536e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8825893849134445, + "num_tokens": 250665664.0, + "step": 2344 + }, + { + "epoch": 5.342075256556442, + "grad_norm": 5.09375, + "learning_rate": 2.426556063813324e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8859640210866928, + "num_tokens": 250773371.0, + "step": 2345 + }, + { + "epoch": 5.344355758266818, + "grad_norm": 5.15625, + "learning_rate": 2.4246734489240554e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.883133664727211, + "num_tokens": 250880142.0, + "step": 2346 + }, + { + "epoch": 5.346636259977195, + "grad_norm": 5.15625, + "learning_rate": 2.4227908767886837e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8815203011035919, + "num_tokens": 250987935.0, + "step": 2347 + }, + { + "epoch": 5.348916761687571, + "grad_norm": 4.21875, + "learning_rate": 2.420908348475719e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8813404738903046, + "num_tokens": 251094697.0, + "step": 2348 + }, + { + "epoch": 5.351197263397948, + "grad_norm": 4.9375, + "learning_rate": 2.4190258650536483e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.883138045668602, + "num_tokens": 251201631.0, + "step": 2349 + }, + { + "epoch": 5.353477765108324, + "grad_norm": 2.421875, + "learning_rate": 2.417143427590933e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8879837691783905, + "num_tokens": 251309130.0, + "step": 2350 + }, + { + "epoch": 5.3557582668187, + "grad_norm": 3.40625, + "learning_rate": 2.4152610371560095e-06, + "loss": 0.567, + "mean_token_accuracy": 0.886492133140564, + "num_tokens": 251416268.0, + "step": 2351 + }, + { + "epoch": 5.3580387685290765, + "grad_norm": 7.125, + "learning_rate": 2.413378694817286e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8863291144371033, + "num_tokens": 251523690.0, + "step": 2352 + }, + { + "epoch": 5.360319270239453, + "grad_norm": 6.96875, + "learning_rate": 2.411496401643142e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8787893354892731, + "num_tokens": 251630852.0, + "step": 2353 + }, + { + "epoch": 5.362599771949829, + "grad_norm": 4.25, + "learning_rate": 2.409614158701932e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.888729602098465, + "num_tokens": 251738309.0, + "step": 2354 + }, + { + "epoch": 5.364880273660205, + "grad_norm": 5.0, + "learning_rate": 2.407731967061981e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8860864639282227, + "num_tokens": 251845628.0, + "step": 2355 + }, + { + "epoch": 5.367160775370581, + "grad_norm": 5.15625, + "learning_rate": 2.4058498277915835e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8805464655160904, + "num_tokens": 251951993.0, + "step": 2356 + }, + { + "epoch": 5.369441277080957, + "grad_norm": 3.453125, + "learning_rate": 2.4039677419590064e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8828859180212021, + "num_tokens": 252059284.0, + "step": 2357 + }, + { + "epoch": 5.3717217787913345, + "grad_norm": 3.46875, + "learning_rate": 2.4020857106324853e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.872930184006691, + "num_tokens": 252166320.0, + "step": 2358 + }, + { + "epoch": 5.374002280501711, + "grad_norm": 4.375, + "learning_rate": 2.4002037348802245e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8799397349357605, + "num_tokens": 252273488.0, + "step": 2359 + }, + { + "epoch": 5.376282782212087, + "grad_norm": 6.0625, + "learning_rate": 2.3983218157703964e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8736053854227066, + "num_tokens": 252380458.0, + "step": 2360 + }, + { + "epoch": 5.378563283922463, + "grad_norm": 5.28125, + "learning_rate": 2.3964399543711427e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8807500749826431, + "num_tokens": 252487767.0, + "step": 2361 + }, + { + "epoch": 5.380843785632839, + "grad_norm": 5.84375, + "learning_rate": 2.394558151750572e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8786357641220093, + "num_tokens": 252595094.0, + "step": 2362 + }, + { + "epoch": 5.3831242873432155, + "grad_norm": 5.84375, + "learning_rate": 2.3926764089767594e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8844779282808304, + "num_tokens": 252702007.0, + "step": 2363 + }, + { + "epoch": 5.385404789053592, + "grad_norm": 6.125, + "learning_rate": 2.3907947271177444e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8814315497875214, + "num_tokens": 252808779.0, + "step": 2364 + }, + { + "epoch": 5.387685290763968, + "grad_norm": 2.703125, + "learning_rate": 2.388913107241534e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8800296932458878, + "num_tokens": 252915277.0, + "step": 2365 + }, + { + "epoch": 5.389965792474344, + "grad_norm": 3.375, + "learning_rate": 2.3870315504160995e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8823367357254028, + "num_tokens": 253022601.0, + "step": 2366 + }, + { + "epoch": 5.39224629418472, + "grad_norm": 2.703125, + "learning_rate": 2.3851500577093757e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8841412514448166, + "num_tokens": 253129731.0, + "step": 2367 + }, + { + "epoch": 5.394526795895097, + "grad_norm": 4.8125, + "learning_rate": 2.3832686301892628e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8834680169820786, + "num_tokens": 253236662.0, + "step": 2368 + }, + { + "epoch": 5.3968072976054735, + "grad_norm": 4.3125, + "learning_rate": 2.381387268923621e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8795076310634613, + "num_tokens": 253343900.0, + "step": 2369 + }, + { + "epoch": 5.39908779931585, + "grad_norm": 3.5625, + "learning_rate": 2.3795059749802756e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8820982426404953, + "num_tokens": 253450924.0, + "step": 2370 + }, + { + "epoch": 5.401368301026226, + "grad_norm": 3.515625, + "learning_rate": 2.377624749427012e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.884630486369133, + "num_tokens": 253558210.0, + "step": 2371 + }, + { + "epoch": 5.403648802736602, + "grad_norm": 4.84375, + "learning_rate": 2.3757435933315787e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.878990963101387, + "num_tokens": 253665052.0, + "step": 2372 + }, + { + "epoch": 5.405929304446978, + "grad_norm": 5.03125, + "learning_rate": 2.3738625077616837e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8815456032752991, + "num_tokens": 253772336.0, + "step": 2373 + }, + { + "epoch": 5.4082098061573545, + "grad_norm": 3.046875, + "learning_rate": 2.371981493784993e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8819355368614197, + "num_tokens": 253878690.0, + "step": 2374 + }, + { + "epoch": 5.410490307867731, + "grad_norm": 2.953125, + "learning_rate": 2.370100552469135e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8823709636926651, + "num_tokens": 253985411.0, + "step": 2375 + }, + { + "epoch": 5.412770809578107, + "grad_norm": 4.6875, + "learning_rate": 2.3682196848816955e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.881668359041214, + "num_tokens": 254092624.0, + "step": 2376 + }, + { + "epoch": 5.415051311288483, + "grad_norm": 2.765625, + "learning_rate": 2.3663388920902198e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8824621737003326, + "num_tokens": 254199578.0, + "step": 2377 + }, + { + "epoch": 5.41733181299886, + "grad_norm": 4.6875, + "learning_rate": 2.3644581751622076e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8811022043228149, + "num_tokens": 254306615.0, + "step": 2378 + }, + { + "epoch": 5.419612314709236, + "grad_norm": 7.15625, + "learning_rate": 2.3625775351651193e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8790160417556763, + "num_tokens": 254413447.0, + "step": 2379 + }, + { + "epoch": 5.4218928164196125, + "grad_norm": 3.796875, + "learning_rate": 2.3606969731663683e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8772003203630447, + "num_tokens": 254520623.0, + "step": 2380 + }, + { + "epoch": 5.424173318129989, + "grad_norm": 2.859375, + "learning_rate": 2.358816490233326e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8835051357746124, + "num_tokens": 254627369.0, + "step": 2381 + }, + { + "epoch": 5.426453819840365, + "grad_norm": 3.40625, + "learning_rate": 2.356936087433318e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8843741118907928, + "num_tokens": 254734806.0, + "step": 2382 + }, + { + "epoch": 5.428734321550741, + "grad_norm": 3.203125, + "learning_rate": 2.3550557658336245e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.881999284029007, + "num_tokens": 254841536.0, + "step": 2383 + }, + { + "epoch": 5.431014823261117, + "grad_norm": 2.84375, + "learning_rate": 2.3531755265014818e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.8869676440954208, + "num_tokens": 254949872.0, + "step": 2384 + }, + { + "epoch": 5.4332953249714935, + "grad_norm": 3.234375, + "learning_rate": 2.3512953705040737e-06, + "loss": 0.525, + "mean_token_accuracy": 0.894694447517395, + "num_tokens": 255057725.0, + "step": 2385 + }, + { + "epoch": 5.43557582668187, + "grad_norm": 4.90625, + "learning_rate": 2.3494152989085433e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.888728454709053, + "num_tokens": 255165920.0, + "step": 2386 + }, + { + "epoch": 5.437856328392247, + "grad_norm": 4.28125, + "learning_rate": 2.3475353127819827e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8810894787311554, + "num_tokens": 255273480.0, + "step": 2387 + }, + { + "epoch": 5.440136830102623, + "grad_norm": 3.328125, + "learning_rate": 2.345655413191434e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.880548432469368, + "num_tokens": 255380729.0, + "step": 2388 + }, + { + "epoch": 5.442417331812999, + "grad_norm": 3.296875, + "learning_rate": 2.3437756012038933e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.882551446557045, + "num_tokens": 255488672.0, + "step": 2389 + }, + { + "epoch": 5.444697833523375, + "grad_norm": 2.609375, + "learning_rate": 2.341895877886306e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8806143552064896, + "num_tokens": 255595607.0, + "step": 2390 + }, + { + "epoch": 5.4469783352337515, + "grad_norm": 2.671875, + "learning_rate": 2.3400162443055655e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8818208873271942, + "num_tokens": 255702745.0, + "step": 2391 + }, + { + "epoch": 5.449258836944128, + "grad_norm": 2.625, + "learning_rate": 2.338136701528516e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8838256001472473, + "num_tokens": 255810202.0, + "step": 2392 + }, + { + "epoch": 5.451539338654504, + "grad_norm": 3.859375, + "learning_rate": 2.33625725062195e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8813562989234924, + "num_tokens": 255916916.0, + "step": 2393 + }, + { + "epoch": 5.45381984036488, + "grad_norm": 2.765625, + "learning_rate": 2.3343778926526074e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8832430243492126, + "num_tokens": 256023471.0, + "step": 2394 + }, + { + "epoch": 5.456100342075256, + "grad_norm": 3.125, + "learning_rate": 2.332498628687176e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8809223771095276, + "num_tokens": 256130280.0, + "step": 2395 + }, + { + "epoch": 5.4583808437856325, + "grad_norm": 3.71875, + "learning_rate": 2.330619459792289e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8768937885761261, + "num_tokens": 256236939.0, + "step": 2396 + }, + { + "epoch": 5.460661345496009, + "grad_norm": 2.703125, + "learning_rate": 2.328740387034526e-06, + "loss": 0.587, + "mean_token_accuracy": 0.8795687556266785, + "num_tokens": 256344069.0, + "step": 2397 + }, + { + "epoch": 5.462941847206386, + "grad_norm": 2.609375, + "learning_rate": 2.326861411480414e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8894121795892715, + "num_tokens": 256450947.0, + "step": 2398 + }, + { + "epoch": 5.465222348916762, + "grad_norm": 2.65625, + "learning_rate": 2.324982534196421e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.8822938203811646, + "num_tokens": 256558445.0, + "step": 2399 + }, + { + "epoch": 5.467502850627138, + "grad_norm": 3.3125, + "learning_rate": 2.3231037562489636e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8841269761323929, + "num_tokens": 256665737.0, + "step": 2400 + }, + { + "epoch": 5.469783352337514, + "grad_norm": 2.90625, + "learning_rate": 2.321225078704399e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8798850178718567, + "num_tokens": 256772658.0, + "step": 2401 + }, + { + "epoch": 5.4720638540478905, + "grad_norm": 2.9375, + "learning_rate": 2.319346502629028e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.883221909403801, + "num_tokens": 256879693.0, + "step": 2402 + }, + { + "epoch": 5.474344355758267, + "grad_norm": 3.78125, + "learning_rate": 2.3174680290890945e-06, + "loss": 0.596, + "mean_token_accuracy": 0.882306694984436, + "num_tokens": 256986979.0, + "step": 2403 + }, + { + "epoch": 5.476624857468643, + "grad_norm": 2.890625, + "learning_rate": 2.315589659150784e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.8786382675170898, + "num_tokens": 257093930.0, + "step": 2404 + }, + { + "epoch": 5.478905359179019, + "grad_norm": 4.03125, + "learning_rate": 2.3137113938802224e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.8781066983938217, + "num_tokens": 257201360.0, + "step": 2405 + }, + { + "epoch": 5.481185860889395, + "grad_norm": 3.96875, + "learning_rate": 2.311833234343478e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8780955076217651, + "num_tokens": 257308294.0, + "step": 2406 + }, + { + "epoch": 5.483466362599772, + "grad_norm": 2.71875, + "learning_rate": 2.3099551816065563e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8727833181619644, + "num_tokens": 257414698.0, + "step": 2407 + }, + { + "epoch": 5.485746864310149, + "grad_norm": 3.125, + "learning_rate": 2.3080772367354046e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.888285905122757, + "num_tokens": 257521674.0, + "step": 2408 + }, + { + "epoch": 5.488027366020525, + "grad_norm": 3.9375, + "learning_rate": 2.3061994007959086e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8786954879760742, + "num_tokens": 257628383.0, + "step": 2409 + }, + { + "epoch": 5.490307867730901, + "grad_norm": 2.515625, + "learning_rate": 2.304321674853891e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8847887217998505, + "num_tokens": 257735646.0, + "step": 2410 + }, + { + "epoch": 5.492588369441277, + "grad_norm": 4.65625, + "learning_rate": 2.3024440599751132e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8823476582765579, + "num_tokens": 257843020.0, + "step": 2411 + }, + { + "epoch": 5.494868871151653, + "grad_norm": 2.984375, + "learning_rate": 2.3005665572252732e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8751996457576752, + "num_tokens": 257949651.0, + "step": 2412 + }, + { + "epoch": 5.4971493728620295, + "grad_norm": 3.390625, + "learning_rate": 2.2986891676700042e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8818869143724442, + "num_tokens": 258056645.0, + "step": 2413 + }, + { + "epoch": 5.499429874572406, + "grad_norm": 3.921875, + "learning_rate": 2.296811892374878e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8861507475376129, + "num_tokens": 258163786.0, + "step": 2414 + }, + { + "epoch": 5.501710376282782, + "grad_norm": 2.796875, + "learning_rate": 2.294934732405398e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8789652734994888, + "num_tokens": 258271199.0, + "step": 2415 + }, + { + "epoch": 5.503990877993158, + "grad_norm": 3.40625, + "learning_rate": 2.293057688827007e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8831315487623215, + "num_tokens": 258378688.0, + "step": 2416 + }, + { + "epoch": 5.506271379703534, + "grad_norm": 2.78125, + "learning_rate": 2.2911807627050745e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8817677646875381, + "num_tokens": 258486085.0, + "step": 2417 + }, + { + "epoch": 5.508551881413911, + "grad_norm": 3.265625, + "learning_rate": 2.2893039551049104e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8783139288425446, + "num_tokens": 258593269.0, + "step": 2418 + }, + { + "epoch": 5.510832383124288, + "grad_norm": 2.78125, + "learning_rate": 2.2874272670917534e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8795038163661957, + "num_tokens": 258700704.0, + "step": 2419 + }, + { + "epoch": 5.513112884834664, + "grad_norm": 4.40625, + "learning_rate": 2.2855506997307766e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8858631104230881, + "num_tokens": 258807861.0, + "step": 2420 + }, + { + "epoch": 5.513112884834664, + "eval_loss": 0.5877215266227722, + "eval_mean_token_accuracy": 0.8795312627186793, + "eval_num_tokens": 258807861.0, + "eval_runtime": 58.5506, + "eval_samples_per_second": 143.21, + "eval_steps_per_second": 4.492, + "step": 2420 + }, + { + "epoch": 5.51539338654504, + "grad_norm": 5.46875, + "learning_rate": 2.283674254087082e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8837638050317764, + "num_tokens": 258914737.0, + "step": 2421 + }, + { + "epoch": 5.517673888255416, + "grad_norm": 3.390625, + "learning_rate": 2.281797931225705e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.8738208562135696, + "num_tokens": 259021178.0, + "step": 2422 + }, + { + "epoch": 5.519954389965792, + "grad_norm": 2.6875, + "learning_rate": 2.279921732211609e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8749384880065918, + "num_tokens": 259127761.0, + "step": 2423 + }, + { + "epoch": 5.5222348916761685, + "grad_norm": 2.453125, + "learning_rate": 2.278045658109689e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8829948306083679, + "num_tokens": 259234696.0, + "step": 2424 + }, + { + "epoch": 5.524515393386545, + "grad_norm": 3.21875, + "learning_rate": 2.2761697099847686e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8792230039834976, + "num_tokens": 259341237.0, + "step": 2425 + }, + { + "epoch": 5.526795895096921, + "grad_norm": 3.796875, + "learning_rate": 2.274293888901599e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8843846768140793, + "num_tokens": 259448732.0, + "step": 2426 + }, + { + "epoch": 5.529076396807298, + "grad_norm": 3.484375, + "learning_rate": 2.2724181959248627e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8816651403903961, + "num_tokens": 259555843.0, + "step": 2427 + }, + { + "epoch": 5.531356898517674, + "grad_norm": 3.46875, + "learning_rate": 2.270542632119163e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8784616589546204, + "num_tokens": 259662832.0, + "step": 2428 + }, + { + "epoch": 5.53363740022805, + "grad_norm": 2.703125, + "learning_rate": 2.2686671985490355e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8840401917695999, + "num_tokens": 259770125.0, + "step": 2429 + }, + { + "epoch": 5.535917901938427, + "grad_norm": 5.78125, + "learning_rate": 2.26679189627894e-06, + "loss": 0.5922, + "mean_token_accuracy": 0.8798678070306778, + "num_tokens": 259876661.0, + "step": 2430 + }, + { + "epoch": 5.538198403648803, + "grad_norm": 2.859375, + "learning_rate": 2.264916726373263e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.8790270835161209, + "num_tokens": 259982852.0, + "step": 2431 + }, + { + "epoch": 5.540478905359179, + "grad_norm": 5.375, + "learning_rate": 2.263041689896313e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.881781816482544, + "num_tokens": 260089664.0, + "step": 2432 + }, + { + "epoch": 5.542759407069555, + "grad_norm": 3.796875, + "learning_rate": 2.261166787912325e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8784068375825882, + "num_tokens": 260196663.0, + "step": 2433 + }, + { + "epoch": 5.545039908779931, + "grad_norm": 2.96875, + "learning_rate": 2.2592920214854573e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8845723420381546, + "num_tokens": 260303381.0, + "step": 2434 + }, + { + "epoch": 5.5473204104903076, + "grad_norm": 3.109375, + "learning_rate": 2.2574173916797912e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8818473219871521, + "num_tokens": 260410524.0, + "step": 2435 + }, + { + "epoch": 5.549600912200685, + "grad_norm": 2.765625, + "learning_rate": 2.2555428995593303e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8806977868080139, + "num_tokens": 260517537.0, + "step": 2436 + }, + { + "epoch": 5.55188141391106, + "grad_norm": 6.15625, + "learning_rate": 2.253668546188e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8790431469678879, + "num_tokens": 260624932.0, + "step": 2437 + }, + { + "epoch": 5.554161915621437, + "grad_norm": 4.15625, + "learning_rate": 2.2517943326296487e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8867908716201782, + "num_tokens": 260732712.0, + "step": 2438 + }, + { + "epoch": 5.556442417331813, + "grad_norm": 3.8125, + "learning_rate": 2.249920259948041e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8851701766252518, + "num_tokens": 260840731.0, + "step": 2439 + }, + { + "epoch": 5.558722919042189, + "grad_norm": 3.828125, + "learning_rate": 2.2480463292068655e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8820369690656662, + "num_tokens": 260947477.0, + "step": 2440 + }, + { + "epoch": 5.561003420752566, + "grad_norm": 3.453125, + "learning_rate": 2.24617254146973e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.879251167178154, + "num_tokens": 261054404.0, + "step": 2441 + }, + { + "epoch": 5.563283922462942, + "grad_norm": 4.0625, + "learning_rate": 2.2442988978001594e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8821232914924622, + "num_tokens": 261162432.0, + "step": 2442 + }, + { + "epoch": 5.565564424173318, + "grad_norm": 3.046875, + "learning_rate": 2.2424253992615983e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8804102689027786, + "num_tokens": 261269127.0, + "step": 2443 + }, + { + "epoch": 5.567844925883694, + "grad_norm": 5.65625, + "learning_rate": 2.2405520469174084e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8801386058330536, + "num_tokens": 261376055.0, + "step": 2444 + }, + { + "epoch": 5.57012542759407, + "grad_norm": 2.546875, + "learning_rate": 2.238678841830867e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8823980838060379, + "num_tokens": 261482935.0, + "step": 2445 + }, + { + "epoch": 5.572405929304447, + "grad_norm": 5.5625, + "learning_rate": 2.23680578506517e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8805437237024307, + "num_tokens": 261589878.0, + "step": 2446 + }, + { + "epoch": 5.574686431014824, + "grad_norm": 2.65625, + "learning_rate": 2.234932877683428e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.878250241279602, + "num_tokens": 261696700.0, + "step": 2447 + }, + { + "epoch": 5.5769669327252, + "grad_norm": 4.28125, + "learning_rate": 2.233060120748667e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8817458301782608, + "num_tokens": 261803758.0, + "step": 2448 + }, + { + "epoch": 5.579247434435576, + "grad_norm": 3.421875, + "learning_rate": 2.2311875153238296e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8814815580844879, + "num_tokens": 261910576.0, + "step": 2449 + }, + { + "epoch": 5.581527936145952, + "grad_norm": 2.8125, + "learning_rate": 2.229315062471768e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8858354687690735, + "num_tokens": 262017627.0, + "step": 2450 + }, + { + "epoch": 5.583808437856328, + "grad_norm": 5.59375, + "learning_rate": 2.2274427632552507e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8767371773719788, + "num_tokens": 262124172.0, + "step": 2451 + }, + { + "epoch": 5.586088939566705, + "grad_norm": 5.96875, + "learning_rate": 2.2255706187369596e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8821327090263367, + "num_tokens": 262231439.0, + "step": 2452 + }, + { + "epoch": 5.588369441277081, + "grad_norm": 5.46875, + "learning_rate": 2.223698629979487e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8792487680912018, + "num_tokens": 262338613.0, + "step": 2453 + }, + { + "epoch": 5.590649942987457, + "grad_norm": 5.65625, + "learning_rate": 2.221826798045338e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8853352516889572, + "num_tokens": 262445246.0, + "step": 2454 + }, + { + "epoch": 5.592930444697833, + "grad_norm": 5.34375, + "learning_rate": 2.2199551239969284e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8816657811403275, + "num_tokens": 262552338.0, + "step": 2455 + }, + { + "epoch": 5.59521094640821, + "grad_norm": 3.3125, + "learning_rate": 2.2180836088965833e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8782707005739212, + "num_tokens": 262659476.0, + "step": 2456 + }, + { + "epoch": 5.5974914481185865, + "grad_norm": 2.9375, + "learning_rate": 2.216212253806539e-06, + "loss": 0.567, + "mean_token_accuracy": 0.88326196372509, + "num_tokens": 262766605.0, + "step": 2457 + }, + { + "epoch": 5.599771949828963, + "grad_norm": 3.140625, + "learning_rate": 2.214341059788941e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8842233419418335, + "num_tokens": 262873710.0, + "step": 2458 + }, + { + "epoch": 5.602052451539339, + "grad_norm": 3.21875, + "learning_rate": 2.2124700279058435e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8748954981565475, + "num_tokens": 262980715.0, + "step": 2459 + }, + { + "epoch": 5.604332953249715, + "grad_norm": 5.28125, + "learning_rate": 2.2105991592192063e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8786115646362305, + "num_tokens": 263087540.0, + "step": 2460 + }, + { + "epoch": 5.606613454960091, + "grad_norm": 3.53125, + "learning_rate": 2.208728454790899e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8810366690158844, + "num_tokens": 263194654.0, + "step": 2461 + }, + { + "epoch": 5.608893956670467, + "grad_norm": 4.3125, + "learning_rate": 2.2068579156826974e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8800259828567505, + "num_tokens": 263302451.0, + "step": 2462 + }, + { + "epoch": 5.611174458380844, + "grad_norm": 2.65625, + "learning_rate": 2.2049875429562845e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8829780966043472, + "num_tokens": 263409487.0, + "step": 2463 + }, + { + "epoch": 5.61345496009122, + "grad_norm": 2.640625, + "learning_rate": 2.203117337673246e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8871753662824631, + "num_tokens": 263516373.0, + "step": 2464 + }, + { + "epoch": 5.615735461801596, + "grad_norm": 3.734375, + "learning_rate": 2.2012473008950756e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.8859710693359375, + "num_tokens": 263624001.0, + "step": 2465 + }, + { + "epoch": 5.618015963511972, + "grad_norm": 4.4375, + "learning_rate": 2.1993774336831696e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8810239285230637, + "num_tokens": 263731549.0, + "step": 2466 + }, + { + "epoch": 5.620296465222349, + "grad_norm": 3.171875, + "learning_rate": 2.197507737098828e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8845034837722778, + "num_tokens": 263838875.0, + "step": 2467 + }, + { + "epoch": 5.6225769669327255, + "grad_norm": 4.59375, + "learning_rate": 2.195638212203255e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.881766065955162, + "num_tokens": 263945489.0, + "step": 2468 + }, + { + "epoch": 5.624857468643102, + "grad_norm": 3.09375, + "learning_rate": 2.193768860057557e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8849270343780518, + "num_tokens": 264052839.0, + "step": 2469 + }, + { + "epoch": 5.627137970353478, + "grad_norm": 2.984375, + "learning_rate": 2.191899681722743e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8784494251012802, + "num_tokens": 264159607.0, + "step": 2470 + }, + { + "epoch": 5.629418472063854, + "grad_norm": 4.15625, + "learning_rate": 2.19003067825972e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8801737427711487, + "num_tokens": 264266733.0, + "step": 2471 + }, + { + "epoch": 5.63169897377423, + "grad_norm": 3.125, + "learning_rate": 2.1881618507293004e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8845822513103485, + "num_tokens": 264374129.0, + "step": 2472 + }, + { + "epoch": 5.633979475484606, + "grad_norm": 2.546875, + "learning_rate": 2.186293200192194e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8821998536586761, + "num_tokens": 264481221.0, + "step": 2473 + }, + { + "epoch": 5.636259977194983, + "grad_norm": 2.640625, + "learning_rate": 2.1844247277090113e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8810071498155594, + "num_tokens": 264588602.0, + "step": 2474 + }, + { + "epoch": 5.638540478905359, + "grad_norm": 3.90625, + "learning_rate": 2.1825564343402606e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8767081648111343, + "num_tokens": 264695289.0, + "step": 2475 + }, + { + "epoch": 5.640820980615736, + "grad_norm": 4.71875, + "learning_rate": 2.180688321146349e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.8806226551532745, + "num_tokens": 264801458.0, + "step": 2476 + }, + { + "epoch": 5.643101482326112, + "grad_norm": 3.484375, + "learning_rate": 2.1788203891875818e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8851145952939987, + "num_tokens": 264908510.0, + "step": 2477 + }, + { + "epoch": 5.645381984036488, + "grad_norm": 2.890625, + "learning_rate": 2.176952639524161e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8833800852298737, + "num_tokens": 265015635.0, + "step": 2478 + }, + { + "epoch": 5.6476624857468645, + "grad_norm": 3.796875, + "learning_rate": 2.175085073216185e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8817281723022461, + "num_tokens": 265122707.0, + "step": 2479 + }, + { + "epoch": 5.649942987457241, + "grad_norm": 2.6875, + "learning_rate": 2.173217691323649e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8875688314437866, + "num_tokens": 265229807.0, + "step": 2480 + }, + { + "epoch": 5.652223489167617, + "grad_norm": 2.78125, + "learning_rate": 2.1713504949064433e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8914333879947662, + "num_tokens": 265337470.0, + "step": 2481 + }, + { + "epoch": 5.654503990877993, + "grad_norm": 3.125, + "learning_rate": 2.169483485024351e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8841661512851715, + "num_tokens": 265444298.0, + "step": 2482 + }, + { + "epoch": 5.656784492588369, + "grad_norm": 3.125, + "learning_rate": 2.167616662737052e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8871481865644455, + "num_tokens": 265551726.0, + "step": 2483 + }, + { + "epoch": 5.659064994298745, + "grad_norm": 4.0, + "learning_rate": 2.1657500291041185e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8866736739873886, + "num_tokens": 265659350.0, + "step": 2484 + }, + { + "epoch": 5.661345496009122, + "grad_norm": 3.75, + "learning_rate": 2.1638835851850155e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8781305104494095, + "num_tokens": 265766187.0, + "step": 2485 + }, + { + "epoch": 5.663625997719498, + "grad_norm": 2.8125, + "learning_rate": 2.1620173320391007e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8814955800771713, + "num_tokens": 265873426.0, + "step": 2486 + }, + { + "epoch": 5.665906499429875, + "grad_norm": 2.65625, + "learning_rate": 2.160151270725623e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8812149912118912, + "num_tokens": 265980661.0, + "step": 2487 + }, + { + "epoch": 5.668187001140251, + "grad_norm": 5.96875, + "learning_rate": 2.158285402303723e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8758508861064911, + "num_tokens": 266087907.0, + "step": 2488 + }, + { + "epoch": 5.670467502850627, + "grad_norm": 3.203125, + "learning_rate": 2.1564197278324317e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8836510181427002, + "num_tokens": 266195252.0, + "step": 2489 + }, + { + "epoch": 5.6727480045610035, + "grad_norm": 3.296875, + "learning_rate": 2.1545542483706694e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8824823647737503, + "num_tokens": 266302369.0, + "step": 2490 + }, + { + "epoch": 5.67502850627138, + "grad_norm": 2.625, + "learning_rate": 2.1526889649772477e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.885822594165802, + "num_tokens": 266409450.0, + "step": 2491 + }, + { + "epoch": 5.677309007981756, + "grad_norm": 3.578125, + "learning_rate": 2.1508238787108633e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8812492936849594, + "num_tokens": 266516611.0, + "step": 2492 + }, + { + "epoch": 5.679589509692132, + "grad_norm": 5.28125, + "learning_rate": 2.1489589906301046e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8759992122650146, + "num_tokens": 266623332.0, + "step": 2493 + }, + { + "epoch": 5.681870011402508, + "grad_norm": 3.453125, + "learning_rate": 2.1470943017934455e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8840955346822739, + "num_tokens": 266730787.0, + "step": 2494 + }, + { + "epoch": 5.684150513112884, + "grad_norm": 3.265625, + "learning_rate": 2.145229813259248e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8826257884502411, + "num_tokens": 266838139.0, + "step": 2495 + }, + { + "epoch": 5.6864310148232615, + "grad_norm": 2.90625, + "learning_rate": 2.143365526085759e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.88275146484375, + "num_tokens": 266944920.0, + "step": 2496 + }, + { + "epoch": 5.688711516533638, + "grad_norm": 2.765625, + "learning_rate": 2.1415014413311126e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.880971685051918, + "num_tokens": 267051841.0, + "step": 2497 + }, + { + "epoch": 5.690992018244014, + "grad_norm": 3.09375, + "learning_rate": 2.139637560053327e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8808932155370712, + "num_tokens": 267159338.0, + "step": 2498 + }, + { + "epoch": 5.69327251995439, + "grad_norm": 3.65625, + "learning_rate": 2.137773883310305e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8823635280132294, + "num_tokens": 267266461.0, + "step": 2499 + }, + { + "epoch": 5.695553021664766, + "grad_norm": 3.28125, + "learning_rate": 2.1359104121598337e-06, + "loss": 0.58, + "mean_token_accuracy": 0.881933718919754, + "num_tokens": 267373196.0, + "step": 2500 + }, + { + "epoch": 5.6978335233751425, + "grad_norm": 4.0, + "learning_rate": 2.1340471476595836e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8774979561567307, + "num_tokens": 267480795.0, + "step": 2501 + }, + { + "epoch": 5.700114025085519, + "grad_norm": 2.671875, + "learning_rate": 2.1321840908671082e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8825899064540863, + "num_tokens": 267587894.0, + "step": 2502 + }, + { + "epoch": 5.702394526795895, + "grad_norm": 3.96875, + "learning_rate": 2.1303212428398407e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8776924163103104, + "num_tokens": 267694535.0, + "step": 2503 + }, + { + "epoch": 5.704675028506271, + "grad_norm": 3.296875, + "learning_rate": 2.1284586046350996e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8794005513191223, + "num_tokens": 267801843.0, + "step": 2504 + }, + { + "epoch": 5.706955530216648, + "grad_norm": 2.921875, + "learning_rate": 2.126596177310081e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.8848482072353363, + "num_tokens": 267909136.0, + "step": 2505 + }, + { + "epoch": 5.7092360319270234, + "grad_norm": 3.125, + "learning_rate": 2.124733961921864e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8841657936573029, + "num_tokens": 268016448.0, + "step": 2506 + }, + { + "epoch": 5.7115165336374005, + "grad_norm": 3.484375, + "learning_rate": 2.1228719595274056e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.8750414401292801, + "num_tokens": 268123331.0, + "step": 2507 + }, + { + "epoch": 5.713797035347777, + "grad_norm": 3.9375, + "learning_rate": 2.1210101711835413e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8833545595407486, + "num_tokens": 268230339.0, + "step": 2508 + }, + { + "epoch": 5.716077537058153, + "grad_norm": 5.53125, + "learning_rate": 2.1191485979469877e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.880239725112915, + "num_tokens": 268337047.0, + "step": 2509 + }, + { + "epoch": 5.718358038768529, + "grad_norm": 3.765625, + "learning_rate": 2.1172872408743374e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8804043382406235, + "num_tokens": 268444006.0, + "step": 2510 + }, + { + "epoch": 5.720638540478905, + "grad_norm": 4.5625, + "learning_rate": 2.11542610102206e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.8754177987575531, + "num_tokens": 268550836.0, + "step": 2511 + }, + { + "epoch": 5.7229190421892815, + "grad_norm": 3.390625, + "learning_rate": 2.1135651794465032e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8840764611959457, + "num_tokens": 268658140.0, + "step": 2512 + }, + { + "epoch": 5.725199543899658, + "grad_norm": 4.09375, + "learning_rate": 2.1117044772038915e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8794988542795181, + "num_tokens": 268764953.0, + "step": 2513 + }, + { + "epoch": 5.727480045610034, + "grad_norm": 4.40625, + "learning_rate": 2.1098439953503207e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8798924386501312, + "num_tokens": 268872046.0, + "step": 2514 + }, + { + "epoch": 5.72976054732041, + "grad_norm": 5.96875, + "learning_rate": 2.1079837349417664e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8788573145866394, + "num_tokens": 268978631.0, + "step": 2515 + }, + { + "epoch": 5.732041049030787, + "grad_norm": 4.375, + "learning_rate": 2.1061236970340756e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8862823694944382, + "num_tokens": 269085803.0, + "step": 2516 + }, + { + "epoch": 5.734321550741163, + "grad_norm": 3.171875, + "learning_rate": 2.104263882682971e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8813868910074234, + "num_tokens": 269192710.0, + "step": 2517 + }, + { + "epoch": 5.7366020524515395, + "grad_norm": 3.671875, + "learning_rate": 2.1024042929440465e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8835171908140182, + "num_tokens": 269299690.0, + "step": 2518 + }, + { + "epoch": 5.738882554161916, + "grad_norm": 3.0, + "learning_rate": 2.1005449288727696e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.8817470818758011, + "num_tokens": 269406982.0, + "step": 2519 + }, + { + "epoch": 5.741163055872292, + "grad_norm": 3.109375, + "learning_rate": 2.0986857915244787e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8834887593984604, + "num_tokens": 269514394.0, + "step": 2520 + }, + { + "epoch": 5.743443557582668, + "grad_norm": 3.140625, + "learning_rate": 2.096826881954385e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8815822005271912, + "num_tokens": 269621655.0, + "step": 2521 + }, + { + "epoch": 5.745724059293044, + "grad_norm": 3.296875, + "learning_rate": 2.0949682012175693e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8810020089149475, + "num_tokens": 269729019.0, + "step": 2522 + }, + { + "epoch": 5.7480045610034205, + "grad_norm": 2.734375, + "learning_rate": 2.093109750368983e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8838683366775513, + "num_tokens": 269836494.0, + "step": 2523 + }, + { + "epoch": 5.750285062713797, + "grad_norm": 3.890625, + "learning_rate": 2.0912515304634485e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8854202926158905, + "num_tokens": 269943579.0, + "step": 2524 + }, + { + "epoch": 5.752565564424174, + "grad_norm": 3.234375, + "learning_rate": 2.089393542555653e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8842357248067856, + "num_tokens": 270050301.0, + "step": 2525 + }, + { + "epoch": 5.75484606613455, + "grad_norm": 3.328125, + "learning_rate": 2.0875357877001556e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8842559903860092, + "num_tokens": 270157064.0, + "step": 2526 + }, + { + "epoch": 5.757126567844926, + "grad_norm": 4.625, + "learning_rate": 2.085678266951382e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8778895437717438, + "num_tokens": 270263910.0, + "step": 2527 + }, + { + "epoch": 5.759407069555302, + "grad_norm": 4.5, + "learning_rate": 2.083820981363626e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8812290579080582, + "num_tokens": 270371067.0, + "step": 2528 + }, + { + "epoch": 5.7616875712656785, + "grad_norm": 2.5, + "learning_rate": 2.0819639319910466e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8800959140062332, + "num_tokens": 270478072.0, + "step": 2529 + }, + { + "epoch": 5.763968072976055, + "grad_norm": 2.6875, + "learning_rate": 2.0801071198876684e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8809636831283569, + "num_tokens": 270584725.0, + "step": 2530 + }, + { + "epoch": 5.766248574686431, + "grad_norm": 3.03125, + "learning_rate": 2.0782505461073822e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8816069960594177, + "num_tokens": 270691204.0, + "step": 2531 + }, + { + "epoch": 5.768529076396807, + "grad_norm": 5.15625, + "learning_rate": 2.076394211703944e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8873765766620636, + "num_tokens": 270798291.0, + "step": 2532 + }, + { + "epoch": 5.770809578107183, + "grad_norm": 4.5625, + "learning_rate": 2.0745381177309732e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8861078023910522, + "num_tokens": 270905483.0, + "step": 2533 + }, + { + "epoch": 5.7730900798175595, + "grad_norm": 2.984375, + "learning_rate": 2.072682265241954e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8886701166629791, + "num_tokens": 271012509.0, + "step": 2534 + }, + { + "epoch": 5.775370581527936, + "grad_norm": 5.21875, + "learning_rate": 2.0708266552902303e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8834939152002335, + "num_tokens": 271119387.0, + "step": 2535 + }, + { + "epoch": 5.777651083238313, + "grad_norm": 4.34375, + "learning_rate": 2.0689712889290114e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8821908384561539, + "num_tokens": 271226607.0, + "step": 2536 + }, + { + "epoch": 5.779931584948689, + "grad_norm": 3.25, + "learning_rate": 2.0671161672113677e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8805604577064514, + "num_tokens": 271333818.0, + "step": 2537 + }, + { + "epoch": 5.782212086659065, + "grad_norm": 2.375, + "learning_rate": 2.06526129119023e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8876101672649384, + "num_tokens": 271440910.0, + "step": 2538 + }, + { + "epoch": 5.784492588369441, + "grad_norm": 2.78125, + "learning_rate": 2.063406661918391e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8823280781507492, + "num_tokens": 271548224.0, + "step": 2539 + }, + { + "epoch": 5.7867730900798175, + "grad_norm": 3.234375, + "learning_rate": 2.0615522804485027e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8831372708082199, + "num_tokens": 271655042.0, + "step": 2540 + }, + { + "epoch": 5.789053591790194, + "grad_norm": 2.5, + "learning_rate": 2.059698147833075e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8820475935935974, + "num_tokens": 271762363.0, + "step": 2541 + }, + { + "epoch": 5.79133409350057, + "grad_norm": 3.484375, + "learning_rate": 2.0578442651244774e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8825207501649857, + "num_tokens": 271869380.0, + "step": 2542 + }, + { + "epoch": 5.793614595210946, + "grad_norm": 5.1875, + "learning_rate": 2.0559906333749392e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8834405839443207, + "num_tokens": 271976735.0, + "step": 2543 + }, + { + "epoch": 5.795895096921322, + "grad_norm": 3.1875, + "learning_rate": 2.054137253636545e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8806865364313126, + "num_tokens": 272084686.0, + "step": 2544 + }, + { + "epoch": 5.798175598631699, + "grad_norm": 2.859375, + "learning_rate": 2.0522841269612397e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8805492520332336, + "num_tokens": 272192178.0, + "step": 2545 + }, + { + "epoch": 5.800456100342076, + "grad_norm": 3.671875, + "learning_rate": 2.0504312544008193e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.883878156542778, + "num_tokens": 272298806.0, + "step": 2546 + }, + { + "epoch": 5.802736602052452, + "grad_norm": 3.640625, + "learning_rate": 2.048578637006939e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8853472024202347, + "num_tokens": 272405957.0, + "step": 2547 + }, + { + "epoch": 5.805017103762828, + "grad_norm": 2.6875, + "learning_rate": 2.04672627583111e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8789206445217133, + "num_tokens": 272513063.0, + "step": 2548 + }, + { + "epoch": 5.807297605473204, + "grad_norm": 2.78125, + "learning_rate": 2.0448741719246962e-06, + "loss": 0.569, + "mean_token_accuracy": 0.883039727807045, + "num_tokens": 272619770.0, + "step": 2549 + }, + { + "epoch": 5.80957810718358, + "grad_norm": 2.75, + "learning_rate": 2.043022326338916e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8768695294857025, + "num_tokens": 272726957.0, + "step": 2550 + }, + { + "epoch": 5.811858608893957, + "grad_norm": 3.65625, + "learning_rate": 2.0411707401248406e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8837281614542007, + "num_tokens": 272834440.0, + "step": 2551 + }, + { + "epoch": 5.814139110604333, + "grad_norm": 3.046875, + "learning_rate": 2.0393194143333956e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8858349472284317, + "num_tokens": 272941653.0, + "step": 2552 + }, + { + "epoch": 5.816419612314709, + "grad_norm": 4.21875, + "learning_rate": 2.0374683500153564e-06, + "loss": 0.567, + "mean_token_accuracy": 0.884008064866066, + "num_tokens": 273048925.0, + "step": 2553 + }, + { + "epoch": 5.818700114025085, + "grad_norm": 5.0625, + "learning_rate": 2.0356175482213523e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8851135820150375, + "num_tokens": 273155880.0, + "step": 2554 + }, + { + "epoch": 5.820980615735461, + "grad_norm": 2.8125, + "learning_rate": 2.033767010001863e-06, + "loss": 0.5922, + "mean_token_accuracy": 0.8770723342895508, + "num_tokens": 273262101.0, + "step": 2555 + }, + { + "epoch": 5.823261117445838, + "grad_norm": 3.0625, + "learning_rate": 2.0319167364072184e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8830315619707108, + "num_tokens": 273369542.0, + "step": 2556 + }, + { + "epoch": 5.825541619156215, + "grad_norm": 3.40625, + "learning_rate": 2.0300667284875965e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8794274926185608, + "num_tokens": 273477240.0, + "step": 2557 + }, + { + "epoch": 5.827822120866591, + "grad_norm": 3.765625, + "learning_rate": 2.0282169872930275e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.8839514553546906, + "num_tokens": 273584941.0, + "step": 2558 + }, + { + "epoch": 5.830102622576967, + "grad_norm": 2.5625, + "learning_rate": 2.026367513873388e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8844399601221085, + "num_tokens": 273692501.0, + "step": 2559 + }, + { + "epoch": 5.832383124287343, + "grad_norm": 3.015625, + "learning_rate": 2.0245183092784046e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8805771768093109, + "num_tokens": 273799051.0, + "step": 2560 + }, + { + "epoch": 5.834663625997719, + "grad_norm": 3.265625, + "learning_rate": 2.0226693745576494e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8822827190160751, + "num_tokens": 273906681.0, + "step": 2561 + }, + { + "epoch": 5.836944127708096, + "grad_norm": 3.046875, + "learning_rate": 2.020820710760541e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8871154636144638, + "num_tokens": 274014464.0, + "step": 2562 + }, + { + "epoch": 5.839224629418472, + "grad_norm": 3.3125, + "learning_rate": 2.018972318936347e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8768787384033203, + "num_tokens": 274121211.0, + "step": 2563 + }, + { + "epoch": 5.841505131128848, + "grad_norm": 3.25, + "learning_rate": 2.017124200134178e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8847302943468094, + "num_tokens": 274228462.0, + "step": 2564 + }, + { + "epoch": 5.843785632839225, + "grad_norm": 3.484375, + "learning_rate": 2.01527635540299e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8782871812582016, + "num_tokens": 274335776.0, + "step": 2565 + }, + { + "epoch": 5.846066134549601, + "grad_norm": 2.609375, + "learning_rate": 2.0134287857915864e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8803008496761322, + "num_tokens": 274443604.0, + "step": 2566 + }, + { + "epoch": 5.848346636259977, + "grad_norm": 2.53125, + "learning_rate": 2.0115814923486093e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8840547949075699, + "num_tokens": 274550591.0, + "step": 2567 + }, + { + "epoch": 5.850627137970354, + "grad_norm": 2.6875, + "learning_rate": 2.009734476122547e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8849809020757675, + "num_tokens": 274657799.0, + "step": 2568 + }, + { + "epoch": 5.85290763968073, + "grad_norm": 2.875, + "learning_rate": 2.007887738161732e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8857974410057068, + "num_tokens": 274765249.0, + "step": 2569 + }, + { + "epoch": 5.855188141391106, + "grad_norm": 2.546875, + "learning_rate": 2.006041279514336e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8802428096532822, + "num_tokens": 274871966.0, + "step": 2570 + }, + { + "epoch": 5.857468643101482, + "grad_norm": 3.65625, + "learning_rate": 2.004195101228374e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8866171091794968, + "num_tokens": 274979490.0, + "step": 2571 + }, + { + "epoch": 5.859749144811858, + "grad_norm": 2.890625, + "learning_rate": 2.002349204351701e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8811267167329788, + "num_tokens": 275086036.0, + "step": 2572 + }, + { + "epoch": 5.862029646522235, + "grad_norm": 3.46875, + "learning_rate": 2.0005035899320115e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8833982795476913, + "num_tokens": 275193239.0, + "step": 2573 + }, + { + "epoch": 5.864310148232612, + "grad_norm": 3.046875, + "learning_rate": 1.998658259016841e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8820087909698486, + "num_tokens": 275300065.0, + "step": 2574 + }, + { + "epoch": 5.866590649942988, + "grad_norm": 2.84375, + "learning_rate": 1.996813212653564e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8738995492458344, + "num_tokens": 275407231.0, + "step": 2575 + }, + { + "epoch": 5.868871151653364, + "grad_norm": 3.25, + "learning_rate": 1.9949684518893926e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8753588199615479, + "num_tokens": 275514097.0, + "step": 2576 + }, + { + "epoch": 5.87115165336374, + "grad_norm": 2.828125, + "learning_rate": 1.9931239777713794e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8908671289682388, + "num_tokens": 275621304.0, + "step": 2577 + }, + { + "epoch": 5.873432155074116, + "grad_norm": 2.546875, + "learning_rate": 1.9912797913464098e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8852016925811768, + "num_tokens": 275728465.0, + "step": 2578 + }, + { + "epoch": 5.875712656784493, + "grad_norm": 3.125, + "learning_rate": 1.989435893661209e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8849275410175323, + "num_tokens": 275835525.0, + "step": 2579 + }, + { + "epoch": 5.877993158494869, + "grad_norm": 2.796875, + "learning_rate": 1.9875922857623387e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8849629461765289, + "num_tokens": 275942316.0, + "step": 2580 + }, + { + "epoch": 5.880273660205245, + "grad_norm": 3.171875, + "learning_rate": 1.985748968696194e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8805515915155411, + "num_tokens": 276048810.0, + "step": 2581 + }, + { + "epoch": 5.882554161915621, + "grad_norm": 3.171875, + "learning_rate": 1.9839059435090073e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8853743970394135, + "num_tokens": 276155430.0, + "step": 2582 + }, + { + "epoch": 5.884834663625997, + "grad_norm": 3.921875, + "learning_rate": 1.9820632112468437e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8853378593921661, + "num_tokens": 276262832.0, + "step": 2583 + }, + { + "epoch": 5.887115165336374, + "grad_norm": 3.625, + "learning_rate": 1.9802207729556023e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.876113772392273, + "num_tokens": 276369762.0, + "step": 2584 + }, + { + "epoch": 5.889395667046751, + "grad_norm": 2.734375, + "learning_rate": 1.9783786296810148e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8896660208702087, + "num_tokens": 276477298.0, + "step": 2585 + }, + { + "epoch": 5.891676168757127, + "grad_norm": 3.765625, + "learning_rate": 1.9765367824686467e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8809142112731934, + "num_tokens": 276583840.0, + "step": 2586 + }, + { + "epoch": 5.893956670467503, + "grad_norm": 3.015625, + "learning_rate": 1.974695232363895e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8866210728883743, + "num_tokens": 276690816.0, + "step": 2587 + }, + { + "epoch": 5.896237172177879, + "grad_norm": 2.859375, + "learning_rate": 1.9728539804119893e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8767704367637634, + "num_tokens": 276797914.0, + "step": 2588 + }, + { + "epoch": 5.898517673888255, + "grad_norm": 4.53125, + "learning_rate": 1.9710130276579864e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8765504211187363, + "num_tokens": 276904386.0, + "step": 2589 + }, + { + "epoch": 5.900798175598632, + "grad_norm": 3.046875, + "learning_rate": 1.969172375146776e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8872781246900558, + "num_tokens": 277011618.0, + "step": 2590 + }, + { + "epoch": 5.903078677309008, + "grad_norm": 2.875, + "learning_rate": 1.9673320239230783e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8807976096868515, + "num_tokens": 277118651.0, + "step": 2591 + }, + { + "epoch": 5.905359179019384, + "grad_norm": 2.5, + "learning_rate": 1.9654919750314396e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8855329602956772, + "num_tokens": 277225439.0, + "step": 2592 + }, + { + "epoch": 5.90763968072976, + "grad_norm": 3.75, + "learning_rate": 1.9636522295162375e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8821413516998291, + "num_tokens": 277332844.0, + "step": 2593 + }, + { + "epoch": 5.909920182440137, + "grad_norm": 4.3125, + "learning_rate": 1.9618127884216753e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8815776854753494, + "num_tokens": 277439892.0, + "step": 2594 + }, + { + "epoch": 5.9122006841505135, + "grad_norm": 2.828125, + "learning_rate": 1.959973652791784e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.885757714509964, + "num_tokens": 277547646.0, + "step": 2595 + }, + { + "epoch": 5.91448118586089, + "grad_norm": 2.625, + "learning_rate": 1.9581348236704217e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8831838816404343, + "num_tokens": 277654799.0, + "step": 2596 + }, + { + "epoch": 5.916761687571266, + "grad_norm": 3.171875, + "learning_rate": 1.9562963021012723e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8817932456731796, + "num_tokens": 277761954.0, + "step": 2597 + }, + { + "epoch": 5.919042189281642, + "grad_norm": 3.4375, + "learning_rate": 1.954458089127845e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8825689405202866, + "num_tokens": 277869215.0, + "step": 2598 + }, + { + "epoch": 5.921322690992018, + "grad_norm": 3.140625, + "learning_rate": 1.952620185793475e-06, + "loss": 0.572, + "mean_token_accuracy": 0.8813809603452682, + "num_tokens": 277975771.0, + "step": 2599 + }, + { + "epoch": 5.923603192702394, + "grad_norm": 4.46875, + "learning_rate": 1.9507825931413193e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8809202462434769, + "num_tokens": 278082716.0, + "step": 2600 + }, + { + "epoch": 5.925883694412771, + "grad_norm": 2.5, + "learning_rate": 1.9489453122143605e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8862514644861221, + "num_tokens": 278189379.0, + "step": 2601 + }, + { + "epoch": 5.928164196123147, + "grad_norm": 2.671875, + "learning_rate": 1.947108344055404e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8841505497694016, + "num_tokens": 278296342.0, + "step": 2602 + }, + { + "epoch": 5.930444697833523, + "grad_norm": 3.84375, + "learning_rate": 1.9452716897070785e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8791919499635696, + "num_tokens": 278403387.0, + "step": 2603 + }, + { + "epoch": 5.932725199543899, + "grad_norm": 2.578125, + "learning_rate": 1.943435350211832e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8786250501871109, + "num_tokens": 278510323.0, + "step": 2604 + }, + { + "epoch": 5.935005701254276, + "grad_norm": 6.0, + "learning_rate": 1.941599326611935e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8808638751506805, + "num_tokens": 278616886.0, + "step": 2605 + }, + { + "epoch": 5.9372862029646525, + "grad_norm": 3.0, + "learning_rate": 1.939763619949481e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.880496084690094, + "num_tokens": 278724068.0, + "step": 2606 + }, + { + "epoch": 5.939566704675029, + "grad_norm": 2.5625, + "learning_rate": 1.9379282312663797e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8817182034254074, + "num_tokens": 278830958.0, + "step": 2607 + }, + { + "epoch": 5.941847206385405, + "grad_norm": 3.125, + "learning_rate": 1.936093161604363e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8876519352197647, + "num_tokens": 278938000.0, + "step": 2608 + }, + { + "epoch": 5.944127708095781, + "grad_norm": 2.953125, + "learning_rate": 1.9342584120049824e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.876665860414505, + "num_tokens": 279044994.0, + "step": 2609 + }, + { + "epoch": 5.946408209806157, + "grad_norm": 2.890625, + "learning_rate": 1.9324239835096044e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8785818070173264, + "num_tokens": 279152387.0, + "step": 2610 + }, + { + "epoch": 5.9486887115165334, + "grad_norm": 3.46875, + "learning_rate": 1.930589877159415e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8864869177341461, + "num_tokens": 279259832.0, + "step": 2611 + }, + { + "epoch": 5.95096921322691, + "grad_norm": 2.640625, + "learning_rate": 1.928756093995419e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8803494274616241, + "num_tokens": 279366863.0, + "step": 2612 + }, + { + "epoch": 5.953249714937286, + "grad_norm": 2.71875, + "learning_rate": 1.9269226350584357e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8808418363332748, + "num_tokens": 279473717.0, + "step": 2613 + }, + { + "epoch": 5.955530216647663, + "grad_norm": 3.375, + "learning_rate": 1.9250895013891015e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8824566006660461, + "num_tokens": 279580810.0, + "step": 2614 + }, + { + "epoch": 5.957810718358039, + "grad_norm": 3.078125, + "learning_rate": 1.9232566940278675e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8856438547372818, + "num_tokens": 279687888.0, + "step": 2615 + }, + { + "epoch": 5.960091220068415, + "grad_norm": 3.703125, + "learning_rate": 1.9214242140149987e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8792087882757187, + "num_tokens": 279794989.0, + "step": 2616 + }, + { + "epoch": 5.9623717217787915, + "grad_norm": 2.828125, + "learning_rate": 1.9195920623905766e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8819312304258347, + "num_tokens": 279901750.0, + "step": 2617 + }, + { + "epoch": 5.964652223489168, + "grad_norm": 3.46875, + "learning_rate": 1.9177602401944943e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8836159706115723, + "num_tokens": 280008000.0, + "step": 2618 + }, + { + "epoch": 5.966932725199544, + "grad_norm": 4.03125, + "learning_rate": 1.915928748466459e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8819707930088043, + "num_tokens": 280115535.0, + "step": 2619 + }, + { + "epoch": 5.96921322690992, + "grad_norm": 3.21875, + "learning_rate": 1.9140975882459912e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8795643150806427, + "num_tokens": 280222510.0, + "step": 2620 + }, + { + "epoch": 5.971493728620296, + "grad_norm": 4.53125, + "learning_rate": 1.9122667605724202e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8824938386678696, + "num_tokens": 280329229.0, + "step": 2621 + }, + { + "epoch": 5.9737742303306725, + "grad_norm": 4.375, + "learning_rate": 1.910436266484889e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8779114931821823, + "num_tokens": 280436155.0, + "step": 2622 + }, + { + "epoch": 5.976054732041049, + "grad_norm": 5.21875, + "learning_rate": 1.908606107022351e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8845522552728653, + "num_tokens": 280543656.0, + "step": 2623 + }, + { + "epoch": 5.978335233751425, + "grad_norm": 3.625, + "learning_rate": 1.9067762832235698e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8839434236288071, + "num_tokens": 280651076.0, + "step": 2624 + }, + { + "epoch": 5.980615735461802, + "grad_norm": 4.6875, + "learning_rate": 1.9049467961271184e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.881288930773735, + "num_tokens": 280758080.0, + "step": 2625 + }, + { + "epoch": 5.982896237172178, + "grad_norm": 2.71875, + "learning_rate": 1.9031176467713763e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8865546584129333, + "num_tokens": 280865575.0, + "step": 2626 + }, + { + "epoch": 5.985176738882554, + "grad_norm": 3.921875, + "learning_rate": 1.9012888361945354e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8823821991682053, + "num_tokens": 280972852.0, + "step": 2627 + }, + { + "epoch": 5.9874572405929305, + "grad_norm": 4.90625, + "learning_rate": 1.8994603654345917e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8786684423685074, + "num_tokens": 281079620.0, + "step": 2628 + }, + { + "epoch": 5.989737742303307, + "grad_norm": 4.9375, + "learning_rate": 1.897632235529351e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8792405128479004, + "num_tokens": 281186453.0, + "step": 2629 + }, + { + "epoch": 5.992018244013683, + "grad_norm": 2.84375, + "learning_rate": 1.8958044475164242e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8830309361219406, + "num_tokens": 281293819.0, + "step": 2630 + }, + { + "epoch": 5.994298745724059, + "grad_norm": 5.46875, + "learning_rate": 1.8939770024332294e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8879213184118271, + "num_tokens": 281400651.0, + "step": 2631 + }, + { + "epoch": 5.996579247434435, + "grad_norm": 2.875, + "learning_rate": 1.8921499013169876e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8807700723409653, + "num_tokens": 281507623.0, + "step": 2632 + }, + { + "epoch": 5.9988597491448115, + "grad_norm": 2.65625, + "learning_rate": 1.8903231452047265e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8816178441047668, + "num_tokens": 281614487.0, + "step": 2633 + }, + { + "epoch": 6.0, + "grad_norm": 5.5625, + "learning_rate": 1.8884967351332778e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8762392401695251, + "num_tokens": 281653392.0, + "step": 2634 + }, + { + "epoch": 6.002280501710376, + "grad_norm": 2.734375, + "learning_rate": 1.886670672139277e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8846362233161926, + "num_tokens": 281760658.0, + "step": 2635 + }, + { + "epoch": 6.004561003420752, + "grad_norm": 2.59375, + "learning_rate": 1.884844957259163e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8855700343847275, + "num_tokens": 281867681.0, + "step": 2636 + }, + { + "epoch": 6.006841505131129, + "grad_norm": 5.15625, + "learning_rate": 1.8830195915291741e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8807573318481445, + "num_tokens": 281974677.0, + "step": 2637 + }, + { + "epoch": 6.009122006841505, + "grad_norm": 3.09375, + "learning_rate": 1.8811945759853543e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8797610551118851, + "num_tokens": 282081659.0, + "step": 2638 + }, + { + "epoch": 6.011402508551882, + "grad_norm": 3.234375, + "learning_rate": 1.879369911663546e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.881444662809372, + "num_tokens": 282188705.0, + "step": 2639 + }, + { + "epoch": 6.013683010262258, + "grad_norm": 2.765625, + "learning_rate": 1.8775455995993941e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.8842919617891312, + "num_tokens": 282295789.0, + "step": 2640 + }, + { + "epoch": 6.013683010262258, + "eval_loss": 0.5869334936141968, + "eval_mean_token_accuracy": 0.8798886684863739, + "eval_num_tokens": 282295789.0, + "eval_runtime": 58.6353, + "eval_samples_per_second": 143.003, + "eval_steps_per_second": 4.485, + "step": 2640 + }, + { + "epoch": 6.015963511972634, + "grad_norm": 2.9375, + "learning_rate": 1.875721640828344e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8848172128200531, + "num_tokens": 282402500.0, + "step": 2641 + }, + { + "epoch": 6.01824401368301, + "grad_norm": 3.71875, + "learning_rate": 1.8738980363856376e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8776390701532364, + "num_tokens": 282509194.0, + "step": 2642 + }, + { + "epoch": 6.020524515393387, + "grad_norm": 3.09375, + "learning_rate": 1.8720747873063184e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8839791864156723, + "num_tokens": 282616314.0, + "step": 2643 + }, + { + "epoch": 6.022805017103763, + "grad_norm": 2.78125, + "learning_rate": 1.870251894625227e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8833863288164139, + "num_tokens": 282723232.0, + "step": 2644 + }, + { + "epoch": 6.025085518814139, + "grad_norm": 3.140625, + "learning_rate": 1.8684293593770026e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8824484199285507, + "num_tokens": 282829758.0, + "step": 2645 + }, + { + "epoch": 6.027366020524515, + "grad_norm": 3.34375, + "learning_rate": 1.866607182596081e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.8822183609008789, + "num_tokens": 282936677.0, + "step": 2646 + }, + { + "epoch": 6.029646522234891, + "grad_norm": 2.8125, + "learning_rate": 1.8647853653166953e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8796509057283401, + "num_tokens": 283043715.0, + "step": 2647 + }, + { + "epoch": 6.031927023945268, + "grad_norm": 3.0, + "learning_rate": 1.862963908572872e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8782380074262619, + "num_tokens": 283150407.0, + "step": 2648 + }, + { + "epoch": 6.034207525655645, + "grad_norm": 2.8125, + "learning_rate": 1.8611428133984365e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8795290291309357, + "num_tokens": 283257249.0, + "step": 2649 + }, + { + "epoch": 6.036488027366021, + "grad_norm": 3.84375, + "learning_rate": 1.8593220808270057e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.881279394030571, + "num_tokens": 283363971.0, + "step": 2650 + }, + { + "epoch": 6.038768529076397, + "grad_norm": 4.78125, + "learning_rate": 1.857501711891993e-06, + "loss": 0.552, + "mean_token_accuracy": 0.8844835609197617, + "num_tokens": 283471402.0, + "step": 2651 + }, + { + "epoch": 6.041049030786773, + "grad_norm": 3.828125, + "learning_rate": 1.8556817076266059e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8844869136810303, + "num_tokens": 283578013.0, + "step": 2652 + }, + { + "epoch": 6.043329532497149, + "grad_norm": 2.84375, + "learning_rate": 1.8538620690638414e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8868309855461121, + "num_tokens": 283685194.0, + "step": 2653 + }, + { + "epoch": 6.045610034207526, + "grad_norm": 2.890625, + "learning_rate": 1.8520427972364924e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8806286752223969, + "num_tokens": 283792579.0, + "step": 2654 + }, + { + "epoch": 6.047890535917902, + "grad_norm": 3.953125, + "learning_rate": 1.8502238931771422e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.884283035993576, + "num_tokens": 283899492.0, + "step": 2655 + }, + { + "epoch": 6.050171037628278, + "grad_norm": 2.8125, + "learning_rate": 1.848405357918166e-06, + "loss": 0.596, + "mean_token_accuracy": 0.8780128657817841, + "num_tokens": 284006120.0, + "step": 2656 + }, + { + "epoch": 6.052451539338654, + "grad_norm": 3.140625, + "learning_rate": 1.8465871924917295e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.8742113560438156, + "num_tokens": 284112587.0, + "step": 2657 + }, + { + "epoch": 6.05473204104903, + "grad_norm": 2.65625, + "learning_rate": 1.8447693979297882e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8869588524103165, + "num_tokens": 284219902.0, + "step": 2658 + }, + { + "epoch": 6.0570125427594075, + "grad_norm": 3.765625, + "learning_rate": 1.8429519752640862e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8819890022277832, + "num_tokens": 284327196.0, + "step": 2659 + }, + { + "epoch": 6.059293044469784, + "grad_norm": 2.4375, + "learning_rate": 1.8411349255261587e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8833761811256409, + "num_tokens": 284434704.0, + "step": 2660 + }, + { + "epoch": 6.06157354618016, + "grad_norm": 5.125, + "learning_rate": 1.8393182497473271e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8822661340236664, + "num_tokens": 284542263.0, + "step": 2661 + }, + { + "epoch": 6.063854047890536, + "grad_norm": 3.875, + "learning_rate": 1.837501948958702e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8850172013044357, + "num_tokens": 284649568.0, + "step": 2662 + }, + { + "epoch": 6.066134549600912, + "grad_norm": 2.65625, + "learning_rate": 1.8356860241911817e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8816670328378677, + "num_tokens": 284757479.0, + "step": 2663 + }, + { + "epoch": 6.068415051311288, + "grad_norm": 3.53125, + "learning_rate": 1.833870476475448e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8798322230577469, + "num_tokens": 284864552.0, + "step": 2664 + }, + { + "epoch": 6.070695553021665, + "grad_norm": 4.8125, + "learning_rate": 1.8320553068419716e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8766209781169891, + "num_tokens": 284971361.0, + "step": 2665 + }, + { + "epoch": 6.072976054732041, + "grad_norm": 3.3125, + "learning_rate": 1.830240516321008e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8826578855514526, + "num_tokens": 285078270.0, + "step": 2666 + }, + { + "epoch": 6.075256556442417, + "grad_norm": 4.03125, + "learning_rate": 1.8284261059425972e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8815598785877228, + "num_tokens": 285185441.0, + "step": 2667 + }, + { + "epoch": 6.077537058152793, + "grad_norm": 3.046875, + "learning_rate": 1.8266120767365642e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8829776495695114, + "num_tokens": 285291883.0, + "step": 2668 + }, + { + "epoch": 6.07981755986317, + "grad_norm": 4.59375, + "learning_rate": 1.8247984297325156e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8801479190587997, + "num_tokens": 285398806.0, + "step": 2669 + }, + { + "epoch": 6.0820980615735465, + "grad_norm": 3.25, + "learning_rate": 1.8229851659598425e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8817605227231979, + "num_tokens": 285505881.0, + "step": 2670 + }, + { + "epoch": 6.084378563283923, + "grad_norm": 3.125, + "learning_rate": 1.8211722864477197e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8797835558652878, + "num_tokens": 285612626.0, + "step": 2671 + }, + { + "epoch": 6.086659064994299, + "grad_norm": 3.546875, + "learning_rate": 1.819359792225101e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8806306272745132, + "num_tokens": 285720025.0, + "step": 2672 + }, + { + "epoch": 6.088939566704675, + "grad_norm": 3.109375, + "learning_rate": 1.8175476843207245e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8841009885072708, + "num_tokens": 285827144.0, + "step": 2673 + }, + { + "epoch": 6.091220068415051, + "grad_norm": 2.734375, + "learning_rate": 1.8157359637631078e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8842537254095078, + "num_tokens": 285933624.0, + "step": 2674 + }, + { + "epoch": 6.0935005701254275, + "grad_norm": 3.421875, + "learning_rate": 1.813924631580547e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.881016343832016, + "num_tokens": 286040489.0, + "step": 2675 + }, + { + "epoch": 6.095781071835804, + "grad_norm": 2.921875, + "learning_rate": 1.8121136888011198e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8789900839328766, + "num_tokens": 286146914.0, + "step": 2676 + }, + { + "epoch": 6.09806157354618, + "grad_norm": 4.15625, + "learning_rate": 1.810303136452683e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8830416947603226, + "num_tokens": 286254159.0, + "step": 2677 + }, + { + "epoch": 6.100342075256556, + "grad_norm": 2.546875, + "learning_rate": 1.8084929755628707e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.8868089914321899, + "num_tokens": 286361203.0, + "step": 2678 + }, + { + "epoch": 6.102622576966933, + "grad_norm": 2.890625, + "learning_rate": 1.8066832071590967e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8792434334754944, + "num_tokens": 286468526.0, + "step": 2679 + }, + { + "epoch": 6.104903078677309, + "grad_norm": 3.140625, + "learning_rate": 1.8048738322685478e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8830481469631195, + "num_tokens": 286575282.0, + "step": 2680 + }, + { + "epoch": 6.1071835803876855, + "grad_norm": 3.0625, + "learning_rate": 1.8030648519181926e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8848658800125122, + "num_tokens": 286682350.0, + "step": 2681 + }, + { + "epoch": 6.109464082098062, + "grad_norm": 2.609375, + "learning_rate": 1.8012562671347721e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8794921636581421, + "num_tokens": 286789153.0, + "step": 2682 + }, + { + "epoch": 6.111744583808438, + "grad_norm": 4.34375, + "learning_rate": 1.7994480789448043e-06, + "loss": 0.598, + "mean_token_accuracy": 0.877688080072403, + "num_tokens": 286896549.0, + "step": 2683 + }, + { + "epoch": 6.114025085518814, + "grad_norm": 2.546875, + "learning_rate": 1.7976402883745836e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8872903436422348, + "num_tokens": 287003999.0, + "step": 2684 + }, + { + "epoch": 6.11630558722919, + "grad_norm": 2.6875, + "learning_rate": 1.7958328964501749e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8816230744123459, + "num_tokens": 287110496.0, + "step": 2685 + }, + { + "epoch": 6.1185860889395665, + "grad_norm": 3.46875, + "learning_rate": 1.7940259041974189e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8829924911260605, + "num_tokens": 287217513.0, + "step": 2686 + }, + { + "epoch": 6.120866590649943, + "grad_norm": 4.28125, + "learning_rate": 1.7922193126419306e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8832157105207443, + "num_tokens": 287324344.0, + "step": 2687 + }, + { + "epoch": 6.123147092360319, + "grad_norm": 4.75, + "learning_rate": 1.7904131228090965e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8847935795783997, + "num_tokens": 287431380.0, + "step": 2688 + }, + { + "epoch": 6.125427594070696, + "grad_norm": 3.84375, + "learning_rate": 1.7886073357240746e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8844207972288132, + "num_tokens": 287538513.0, + "step": 2689 + }, + { + "epoch": 6.127708095781072, + "grad_norm": 3.109375, + "learning_rate": 1.7868019524117957e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8769582509994507, + "num_tokens": 287645242.0, + "step": 2690 + }, + { + "epoch": 6.129988597491448, + "grad_norm": 3.375, + "learning_rate": 1.7849969738969592e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8752488493919373, + "num_tokens": 287751854.0, + "step": 2691 + }, + { + "epoch": 6.1322690992018245, + "grad_norm": 2.765625, + "learning_rate": 1.783192401204037e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8854573518037796, + "num_tokens": 287858938.0, + "step": 2692 + }, + { + "epoch": 6.134549600912201, + "grad_norm": 4.34375, + "learning_rate": 1.7813882353572692e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8819147795438766, + "num_tokens": 287965788.0, + "step": 2693 + }, + { + "epoch": 6.136830102622577, + "grad_norm": 3.484375, + "learning_rate": 1.7795844773806653e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.884269118309021, + "num_tokens": 288073071.0, + "step": 2694 + }, + { + "epoch": 6.139110604332953, + "grad_norm": 3.390625, + "learning_rate": 1.7777811282980047e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8846654891967773, + "num_tokens": 288180362.0, + "step": 2695 + }, + { + "epoch": 6.141391106043329, + "grad_norm": 3.4375, + "learning_rate": 1.7759781891328321e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8841256648302078, + "num_tokens": 288287566.0, + "step": 2696 + }, + { + "epoch": 6.1436716077537055, + "grad_norm": 5.59375, + "learning_rate": 1.7741756609084616e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.8848319351673126, + "num_tokens": 288395039.0, + "step": 2697 + }, + { + "epoch": 6.145952109464082, + "grad_norm": 3.875, + "learning_rate": 1.772373544647973e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8844243884086609, + "num_tokens": 288502651.0, + "step": 2698 + }, + { + "epoch": 6.148232611174459, + "grad_norm": 3.234375, + "learning_rate": 1.770571841374213e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.882807731628418, + "num_tokens": 288609620.0, + "step": 2699 + }, + { + "epoch": 6.150513112884835, + "grad_norm": 3.828125, + "learning_rate": 1.7687705521097954e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8844881951808929, + "num_tokens": 288716986.0, + "step": 2700 + }, + { + "epoch": 6.152793614595211, + "grad_norm": 3.296875, + "learning_rate": 1.766969677877094e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8841705173254013, + "num_tokens": 288824522.0, + "step": 2701 + }, + { + "epoch": 6.155074116305587, + "grad_norm": 3.140625, + "learning_rate": 1.7651692196982517e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8823985755443573, + "num_tokens": 288931504.0, + "step": 2702 + }, + { + "epoch": 6.1573546180159635, + "grad_norm": 3.234375, + "learning_rate": 1.7633691785951746e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8812411278486252, + "num_tokens": 289038321.0, + "step": 2703 + }, + { + "epoch": 6.15963511972634, + "grad_norm": 3.28125, + "learning_rate": 1.7615695555895296e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8865274041891098, + "num_tokens": 289145591.0, + "step": 2704 + }, + { + "epoch": 6.161915621436716, + "grad_norm": 3.65625, + "learning_rate": 1.7597703517027491e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8811384439468384, + "num_tokens": 289252364.0, + "step": 2705 + }, + { + "epoch": 6.164196123147092, + "grad_norm": 2.5625, + "learning_rate": 1.7579715679560273e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8845842778682709, + "num_tokens": 289359378.0, + "step": 2706 + }, + { + "epoch": 6.166476624857468, + "grad_norm": 3.234375, + "learning_rate": 1.7561732053703174e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8825568854808807, + "num_tokens": 289466858.0, + "step": 2707 + }, + { + "epoch": 6.168757126567845, + "grad_norm": 4.15625, + "learning_rate": 1.7543752649663354e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8844540864229202, + "num_tokens": 289574570.0, + "step": 2708 + }, + { + "epoch": 6.1710376282782216, + "grad_norm": 2.984375, + "learning_rate": 1.7525777477645586e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8892973810434341, + "num_tokens": 289682543.0, + "step": 2709 + }, + { + "epoch": 6.173318129988598, + "grad_norm": 5.4375, + "learning_rate": 1.7507806547852224e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.885601207613945, + "num_tokens": 289789759.0, + "step": 2710 + }, + { + "epoch": 6.175598631698974, + "grad_norm": 3.671875, + "learning_rate": 1.7489839870483236e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.885807454586029, + "num_tokens": 289897075.0, + "step": 2711 + }, + { + "epoch": 6.17787913340935, + "grad_norm": 3.34375, + "learning_rate": 1.7471877455736136e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8810791075229645, + "num_tokens": 290004392.0, + "step": 2712 + }, + { + "epoch": 6.180159635119726, + "grad_norm": 3.046875, + "learning_rate": 1.7453919313806057e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.880391463637352, + "num_tokens": 290111469.0, + "step": 2713 + }, + { + "epoch": 6.1824401368301025, + "grad_norm": 4.03125, + "learning_rate": 1.7435965454885699e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.8832577913999557, + "num_tokens": 290218539.0, + "step": 2714 + }, + { + "epoch": 6.184720638540479, + "grad_norm": 3.546875, + "learning_rate": 1.7418015889165312e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8896812498569489, + "num_tokens": 290326424.0, + "step": 2715 + }, + { + "epoch": 6.187001140250855, + "grad_norm": 3.8125, + "learning_rate": 1.7400070626832732e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8802052289247513, + "num_tokens": 290433540.0, + "step": 2716 + }, + { + "epoch": 6.189281641961231, + "grad_norm": 4.34375, + "learning_rate": 1.7382129678073351e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.878139853477478, + "num_tokens": 290540729.0, + "step": 2717 + }, + { + "epoch": 6.191562143671608, + "grad_norm": 4.21875, + "learning_rate": 1.7364193053070082e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8827469646930695, + "num_tokens": 290647100.0, + "step": 2718 + }, + { + "epoch": 6.193842645381984, + "grad_norm": 4.1875, + "learning_rate": 1.7346260762003428e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8845665007829666, + "num_tokens": 290753980.0, + "step": 2719 + }, + { + "epoch": 6.196123147092361, + "grad_norm": 5.40625, + "learning_rate": 1.7328332815051403e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8774483948945999, + "num_tokens": 290861484.0, + "step": 2720 + }, + { + "epoch": 6.198403648802737, + "grad_norm": 3.171875, + "learning_rate": 1.7310409222389563e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8843204379081726, + "num_tokens": 290968606.0, + "step": 2721 + }, + { + "epoch": 6.200684150513113, + "grad_norm": 3.546875, + "learning_rate": 1.7292489994191005e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8810103088617325, + "num_tokens": 291075885.0, + "step": 2722 + }, + { + "epoch": 6.202964652223489, + "grad_norm": 2.75, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8839705884456635, + "num_tokens": 291183303.0, + "step": 2723 + }, + { + "epoch": 6.205245153933865, + "grad_norm": 3.1875, + "learning_rate": 1.7256664671863634e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8867494761943817, + "num_tokens": 291290745.0, + "step": 2724 + }, + { + "epoch": 6.2075256556442415, + "grad_norm": 5.1875, + "learning_rate": 1.72387585980686e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8760453164577484, + "num_tokens": 291397663.0, + "step": 2725 + }, + { + "epoch": 6.209806157354618, + "grad_norm": 5.1875, + "learning_rate": 1.7220856929404342e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8818619549274445, + "num_tokens": 291504707.0, + "step": 2726 + }, + { + "epoch": 6.212086659064994, + "grad_norm": 3.390625, + "learning_rate": 1.720295967603152e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8842518627643585, + "num_tokens": 291611837.0, + "step": 2727 + }, + { + "epoch": 6.214367160775371, + "grad_norm": 5.78125, + "learning_rate": 1.7185066848108244e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8826566338539124, + "num_tokens": 291718969.0, + "step": 2728 + }, + { + "epoch": 6.216647662485747, + "grad_norm": 4.59375, + "learning_rate": 1.7167178455790157e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8848382532596588, + "num_tokens": 291826377.0, + "step": 2729 + }, + { + "epoch": 6.218928164196123, + "grad_norm": 3.6875, + "learning_rate": 1.7149294509230357e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8756033033132553, + "num_tokens": 291933188.0, + "step": 2730 + }, + { + "epoch": 6.2212086659065, + "grad_norm": 2.84375, + "learning_rate": 1.713141501857943e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8862540572881699, + "num_tokens": 292040480.0, + "step": 2731 + }, + { + "epoch": 6.223489167616876, + "grad_norm": 3.765625, + "learning_rate": 1.7113539993985431e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8784683644771576, + "num_tokens": 292147117.0, + "step": 2732 + }, + { + "epoch": 6.225769669327252, + "grad_norm": 5.1875, + "learning_rate": 1.7095669445593887e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8856497257947922, + "num_tokens": 292254295.0, + "step": 2733 + }, + { + "epoch": 6.228050171037628, + "grad_norm": 3.203125, + "learning_rate": 1.707780338354776e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8842949420213699, + "num_tokens": 292361287.0, + "step": 2734 + }, + { + "epoch": 6.230330672748004, + "grad_norm": 2.984375, + "learning_rate": 1.7059941817987485e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8810069411993027, + "num_tokens": 292468510.0, + "step": 2735 + }, + { + "epoch": 6.2326111744583805, + "grad_norm": 2.921875, + "learning_rate": 1.7042084759050948e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8827442526817322, + "num_tokens": 292575697.0, + "step": 2736 + }, + { + "epoch": 6.234891676168757, + "grad_norm": 2.53125, + "learning_rate": 1.7024232216873465e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8848889619112015, + "num_tokens": 292682969.0, + "step": 2737 + }, + { + "epoch": 6.237172177879134, + "grad_norm": 4.84375, + "learning_rate": 1.7006384201587809e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.884590744972229, + "num_tokens": 292790103.0, + "step": 2738 + }, + { + "epoch": 6.23945267958951, + "grad_norm": 3.109375, + "learning_rate": 1.6988540723324145e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.881315678358078, + "num_tokens": 292897356.0, + "step": 2739 + }, + { + "epoch": 6.241733181299886, + "grad_norm": 3.546875, + "learning_rate": 1.6970701792210101e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8851087540388107, + "num_tokens": 293004918.0, + "step": 2740 + }, + { + "epoch": 6.244013683010262, + "grad_norm": 3.1875, + "learning_rate": 1.6952867418370707e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8804485946893692, + "num_tokens": 293112328.0, + "step": 2741 + }, + { + "epoch": 6.246294184720639, + "grad_norm": 5.96875, + "learning_rate": 1.6935037611928412e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8765440136194229, + "num_tokens": 293218993.0, + "step": 2742 + }, + { + "epoch": 6.248574686431015, + "grad_norm": 5.1875, + "learning_rate": 1.691721238300308e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.882225289940834, + "num_tokens": 293325757.0, + "step": 2743 + }, + { + "epoch": 6.250855188141391, + "grad_norm": 3.265625, + "learning_rate": 1.689939174171194e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.876752108335495, + "num_tokens": 293432687.0, + "step": 2744 + }, + { + "epoch": 6.253135689851767, + "grad_norm": 3.5625, + "learning_rate": 1.6881575698169662e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8828807026147842, + "num_tokens": 293540213.0, + "step": 2745 + }, + { + "epoch": 6.255416191562143, + "grad_norm": 2.71875, + "learning_rate": 1.6863764262488292e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8817939162254333, + "num_tokens": 293647497.0, + "step": 2746 + }, + { + "epoch": 6.2576966932725195, + "grad_norm": 2.828125, + "learning_rate": 1.6845957444777244e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8802742511034012, + "num_tokens": 293754862.0, + "step": 2747 + }, + { + "epoch": 6.259977194982897, + "grad_norm": 2.640625, + "learning_rate": 1.6828155255143331e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.882528692483902, + "num_tokens": 293861700.0, + "step": 2748 + }, + { + "epoch": 6.262257696693273, + "grad_norm": 6.125, + "learning_rate": 1.6810357703690739e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8790639489889145, + "num_tokens": 293969027.0, + "step": 2749 + }, + { + "epoch": 6.264538198403649, + "grad_norm": 2.734375, + "learning_rate": 1.6792564800521e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8794273883104324, + "num_tokens": 294075857.0, + "step": 2750 + }, + { + "epoch": 6.266818700114025, + "grad_norm": 3.046875, + "learning_rate": 1.677477655573303e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8795068562030792, + "num_tokens": 294182659.0, + "step": 2751 + }, + { + "epoch": 6.269099201824401, + "grad_norm": 3.796875, + "learning_rate": 1.675699297942309e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8815376460552216, + "num_tokens": 294289557.0, + "step": 2752 + }, + { + "epoch": 6.271379703534778, + "grad_norm": 3.0, + "learning_rate": 1.6739214081684799e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.876123234629631, + "num_tokens": 294396897.0, + "step": 2753 + }, + { + "epoch": 6.273660205245154, + "grad_norm": 2.65625, + "learning_rate": 1.6721439872609125e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8840298056602478, + "num_tokens": 294504720.0, + "step": 2754 + }, + { + "epoch": 6.27594070695553, + "grad_norm": 6.96875, + "learning_rate": 1.6703670362284346e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8847046941518784, + "num_tokens": 294611688.0, + "step": 2755 + }, + { + "epoch": 6.278221208665906, + "grad_norm": 3.8125, + "learning_rate": 1.6685905560796101e-06, + "loss": 0.584, + "mean_token_accuracy": 0.878638818860054, + "num_tokens": 294718635.0, + "step": 2756 + }, + { + "epoch": 6.280501710376283, + "grad_norm": 3.203125, + "learning_rate": 1.6668145478227354e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8829765170812607, + "num_tokens": 294825331.0, + "step": 2757 + }, + { + "epoch": 6.282782212086659, + "grad_norm": 3.921875, + "learning_rate": 1.6650390124658378e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.878277525305748, + "num_tokens": 294932357.0, + "step": 2758 + }, + { + "epoch": 6.285062713797036, + "grad_norm": 2.9375, + "learning_rate": 1.663263951016678e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8813124597072601, + "num_tokens": 295039173.0, + "step": 2759 + }, + { + "epoch": 6.287343215507412, + "grad_norm": 4.28125, + "learning_rate": 1.661489364482745e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8825227618217468, + "num_tokens": 295146182.0, + "step": 2760 + }, + { + "epoch": 6.289623717217788, + "grad_norm": 3.015625, + "learning_rate": 1.6597152538712608e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8797194510698318, + "num_tokens": 295253659.0, + "step": 2761 + }, + { + "epoch": 6.291904218928164, + "grad_norm": 2.765625, + "learning_rate": 1.6579416201891757e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.8829602301120758, + "num_tokens": 295361105.0, + "step": 2762 + }, + { + "epoch": 6.29418472063854, + "grad_norm": 5.4375, + "learning_rate": 1.6561684644431709e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8760716319084167, + "num_tokens": 295467879.0, + "step": 2763 + }, + { + "epoch": 6.296465222348917, + "grad_norm": 4.25, + "learning_rate": 1.6543957876396544e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8809865862131119, + "num_tokens": 295575043.0, + "step": 2764 + }, + { + "epoch": 6.298745724059293, + "grad_norm": 4.40625, + "learning_rate": 1.6526235907847649e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8808043301105499, + "num_tokens": 295682338.0, + "step": 2765 + }, + { + "epoch": 6.301026225769669, + "grad_norm": 3.0625, + "learning_rate": 1.6508518748843651e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8840913772583008, + "num_tokens": 295789718.0, + "step": 2766 + }, + { + "epoch": 6.303306727480045, + "grad_norm": 2.9375, + "learning_rate": 1.649080640944048e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8826287239789963, + "num_tokens": 295897773.0, + "step": 2767 + }, + { + "epoch": 6.305587229190422, + "grad_norm": 3.28125, + "learning_rate": 1.6473098899691313e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8791597187519073, + "num_tokens": 296004423.0, + "step": 2768 + }, + { + "epoch": 6.307867730900798, + "grad_norm": 4.0, + "learning_rate": 1.6455396229646595e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8822521567344666, + "num_tokens": 296111701.0, + "step": 2769 + }, + { + "epoch": 6.310148232611175, + "grad_norm": 2.6875, + "learning_rate": 1.6437698409354025e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.887429729104042, + "num_tokens": 296218964.0, + "step": 2770 + }, + { + "epoch": 6.312428734321551, + "grad_norm": 2.921875, + "learning_rate": 1.6420005448858522e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8810682147741318, + "num_tokens": 296325719.0, + "step": 2771 + }, + { + "epoch": 6.314709236031927, + "grad_norm": 4.21875, + "learning_rate": 1.6402317358202286e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8821781575679779, + "num_tokens": 296432464.0, + "step": 2772 + }, + { + "epoch": 6.316989737742303, + "grad_norm": 4.125, + "learning_rate": 1.6384634147424732e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8829574584960938, + "num_tokens": 296539951.0, + "step": 2773 + }, + { + "epoch": 6.319270239452679, + "grad_norm": 2.71875, + "learning_rate": 1.636695582656251e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8789397776126862, + "num_tokens": 296646664.0, + "step": 2774 + }, + { + "epoch": 6.321550741163056, + "grad_norm": 3.359375, + "learning_rate": 1.6349282405649506e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.889549657702446, + "num_tokens": 296754082.0, + "step": 2775 + }, + { + "epoch": 6.323831242873432, + "grad_norm": 3.6875, + "learning_rate": 1.6331613894716787e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.8843847513198853, + "num_tokens": 296861010.0, + "step": 2776 + }, + { + "epoch": 6.326111744583809, + "grad_norm": 6.21875, + "learning_rate": 1.6313950303792672e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8804360330104828, + "num_tokens": 296968168.0, + "step": 2777 + }, + { + "epoch": 6.328392246294185, + "grad_norm": 2.796875, + "learning_rate": 1.6296291642902673e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8830510526895523, + "num_tokens": 297075136.0, + "step": 2778 + }, + { + "epoch": 6.330672748004561, + "grad_norm": 3.046875, + "learning_rate": 1.6278637922069512e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.8757496923208237, + "num_tokens": 297182280.0, + "step": 2779 + }, + { + "epoch": 6.3329532497149374, + "grad_norm": 4.125, + "learning_rate": 1.6260989151313091e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8829326927661896, + "num_tokens": 297288826.0, + "step": 2780 + }, + { + "epoch": 6.335233751425314, + "grad_norm": 3.59375, + "learning_rate": 1.6243345340650523e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8820019513368607, + "num_tokens": 297395720.0, + "step": 2781 + }, + { + "epoch": 6.33751425313569, + "grad_norm": 2.53125, + "learning_rate": 1.6225706500096079e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8875852972269058, + "num_tokens": 297502615.0, + "step": 2782 + }, + { + "epoch": 6.339794754846066, + "grad_norm": 2.875, + "learning_rate": 1.6208072639661226e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8859220743179321, + "num_tokens": 297609799.0, + "step": 2783 + }, + { + "epoch": 6.342075256556442, + "grad_norm": 3.03125, + "learning_rate": 1.6190443769354608e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8765744715929031, + "num_tokens": 297716620.0, + "step": 2784 + }, + { + "epoch": 6.344355758266818, + "grad_norm": 2.84375, + "learning_rate": 1.6172819899182036e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8797289431095123, + "num_tokens": 297823497.0, + "step": 2785 + }, + { + "epoch": 6.346636259977195, + "grad_norm": 4.9375, + "learning_rate": 1.6155201039146478e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.876724436879158, + "num_tokens": 297930683.0, + "step": 2786 + }, + { + "epoch": 6.348916761687571, + "grad_norm": 3.75, + "learning_rate": 1.613758719924805e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8765078336000443, + "num_tokens": 298037433.0, + "step": 2787 + }, + { + "epoch": 6.351197263397948, + "grad_norm": 2.640625, + "learning_rate": 1.611997838948403e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8840204477310181, + "num_tokens": 298145428.0, + "step": 2788 + }, + { + "epoch": 6.353477765108324, + "grad_norm": 4.71875, + "learning_rate": 1.6102374619848845e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8840190768241882, + "num_tokens": 298252510.0, + "step": 2789 + }, + { + "epoch": 6.3557582668187, + "grad_norm": 2.640625, + "learning_rate": 1.6084775900334046e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8814188688993454, + "num_tokens": 298359365.0, + "step": 2790 + }, + { + "epoch": 6.3580387685290765, + "grad_norm": 2.953125, + "learning_rate": 1.6067182240928332e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8799934387207031, + "num_tokens": 298466492.0, + "step": 2791 + }, + { + "epoch": 6.360319270239453, + "grad_norm": 2.96875, + "learning_rate": 1.6049593651617534e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8819076120853424, + "num_tokens": 298573579.0, + "step": 2792 + }, + { + "epoch": 6.362599771949829, + "grad_norm": 3.21875, + "learning_rate": 1.6032010142384572e-06, + "loss": 0.575, + "mean_token_accuracy": 0.880363255739212, + "num_tokens": 298680688.0, + "step": 2793 + }, + { + "epoch": 6.364880273660205, + "grad_norm": 3.28125, + "learning_rate": 1.6014431723209522e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8782119452953339, + "num_tokens": 298787550.0, + "step": 2794 + }, + { + "epoch": 6.367160775370581, + "grad_norm": 3.3125, + "learning_rate": 1.599685840406955e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8798055797815323, + "num_tokens": 298895010.0, + "step": 2795 + }, + { + "epoch": 6.369441277080957, + "grad_norm": 3.59375, + "learning_rate": 1.5979290194938938e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8793617337942123, + "num_tokens": 299001708.0, + "step": 2796 + }, + { + "epoch": 6.3717217787913345, + "grad_norm": 3.015625, + "learning_rate": 1.5961727105789072e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8799160867929459, + "num_tokens": 299108766.0, + "step": 2797 + }, + { + "epoch": 6.374002280501711, + "grad_norm": 3.375, + "learning_rate": 1.5944169146588395e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8847594708204269, + "num_tokens": 299215718.0, + "step": 2798 + }, + { + "epoch": 6.376282782212087, + "grad_norm": 4.96875, + "learning_rate": 1.5926616327302482e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8823742717504501, + "num_tokens": 299322716.0, + "step": 2799 + }, + { + "epoch": 6.378563283922463, + "grad_norm": 4.6875, + "learning_rate": 1.5909068657893978e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8821296989917755, + "num_tokens": 299429926.0, + "step": 2800 + }, + { + "epoch": 6.380843785632839, + "grad_norm": 2.75, + "learning_rate": 1.5891526148322594e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8831614851951599, + "num_tokens": 299537185.0, + "step": 2801 + }, + { + "epoch": 6.3831242873432155, + "grad_norm": 3.125, + "learning_rate": 1.5873988808545127e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8860654383897781, + "num_tokens": 299644399.0, + "step": 2802 + }, + { + "epoch": 6.385404789053592, + "grad_norm": 2.609375, + "learning_rate": 1.5856456648515425e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8893062770366669, + "num_tokens": 299751766.0, + "step": 2803 + }, + { + "epoch": 6.387685290763968, + "grad_norm": 2.5, + "learning_rate": 1.5838929678184405e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.880397379398346, + "num_tokens": 299857907.0, + "step": 2804 + }, + { + "epoch": 6.389965792474344, + "grad_norm": 2.953125, + "learning_rate": 1.5821407907500036e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8866594880819321, + "num_tokens": 299965673.0, + "step": 2805 + }, + { + "epoch": 6.39224629418472, + "grad_norm": 3.078125, + "learning_rate": 1.5803891346407342e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.8857071846723557, + "num_tokens": 300072879.0, + "step": 2806 + }, + { + "epoch": 6.394526795895097, + "grad_norm": 2.8125, + "learning_rate": 1.5786380004848379e-06, + "loss": 0.582, + "mean_token_accuracy": 0.8817501962184906, + "num_tokens": 300180006.0, + "step": 2807 + }, + { + "epoch": 6.3968072976054735, + "grad_norm": 2.6875, + "learning_rate": 1.576887389276226e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8835788518190384, + "num_tokens": 300287280.0, + "step": 2808 + }, + { + "epoch": 6.39908779931585, + "grad_norm": 2.9375, + "learning_rate": 1.5751373020085093e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8786880671977997, + "num_tokens": 300394445.0, + "step": 2809 + }, + { + "epoch": 6.401368301026226, + "grad_norm": 3.046875, + "learning_rate": 1.5733877396750051e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8815035223960876, + "num_tokens": 300501045.0, + "step": 2810 + }, + { + "epoch": 6.403648802736602, + "grad_norm": 2.53125, + "learning_rate": 1.5716387032687314e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.883103683590889, + "num_tokens": 300607844.0, + "step": 2811 + }, + { + "epoch": 6.405929304446978, + "grad_norm": 2.8125, + "learning_rate": 1.5698901937824066e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8773173838853836, + "num_tokens": 300714942.0, + "step": 2812 + }, + { + "epoch": 6.4082098061573545, + "grad_norm": 3.0625, + "learning_rate": 1.5681422122084522e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8895971029996872, + "num_tokens": 300822920.0, + "step": 2813 + }, + { + "epoch": 6.410490307867731, + "grad_norm": 4.4375, + "learning_rate": 1.5663947595389873e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8785582333803177, + "num_tokens": 300930457.0, + "step": 2814 + }, + { + "epoch": 6.412770809578107, + "grad_norm": 2.609375, + "learning_rate": 1.5646478367658325e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8812803626060486, + "num_tokens": 301037070.0, + "step": 2815 + }, + { + "epoch": 6.415051311288483, + "grad_norm": 3.09375, + "learning_rate": 1.562901444880508e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.8858920186758041, + "num_tokens": 301144185.0, + "step": 2816 + }, + { + "epoch": 6.41733181299886, + "grad_norm": 2.640625, + "learning_rate": 1.5611555848742318e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8812452852725983, + "num_tokens": 301251360.0, + "step": 2817 + }, + { + "epoch": 6.419612314709236, + "grad_norm": 4.25, + "learning_rate": 1.5594102577379216e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8858301192522049, + "num_tokens": 301358185.0, + "step": 2818 + }, + { + "epoch": 6.4218928164196125, + "grad_norm": 3.515625, + "learning_rate": 1.5576654644621897e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8778669685125351, + "num_tokens": 301464922.0, + "step": 2819 + }, + { + "epoch": 6.424173318129989, + "grad_norm": 2.578125, + "learning_rate": 1.5559212060373474e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8837179839611053, + "num_tokens": 301571945.0, + "step": 2820 + }, + { + "epoch": 6.426453819840365, + "grad_norm": 2.765625, + "learning_rate": 1.5541774834534024e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8790989071130753, + "num_tokens": 301678995.0, + "step": 2821 + }, + { + "epoch": 6.428734321550741, + "grad_norm": 3.46875, + "learning_rate": 1.5524342977000587e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.880190521478653, + "num_tokens": 301786689.0, + "step": 2822 + }, + { + "epoch": 6.431014823261117, + "grad_norm": 3.703125, + "learning_rate": 1.5506916497667134e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8833551555871964, + "num_tokens": 301894003.0, + "step": 2823 + }, + { + "epoch": 6.4332953249714935, + "grad_norm": 2.671875, + "learning_rate": 1.5489495406424618e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8886000365018845, + "num_tokens": 302001595.0, + "step": 2824 + }, + { + "epoch": 6.43557582668187, + "grad_norm": 2.71875, + "learning_rate": 1.5472079713160892e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8830095380544662, + "num_tokens": 302109787.0, + "step": 2825 + }, + { + "epoch": 6.437856328392247, + "grad_norm": 4.0, + "learning_rate": 1.5454669427760774e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8822568953037262, + "num_tokens": 302216211.0, + "step": 2826 + }, + { + "epoch": 6.440136830102623, + "grad_norm": 2.9375, + "learning_rate": 1.5437264560106014e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8839522004127502, + "num_tokens": 302323292.0, + "step": 2827 + }, + { + "epoch": 6.442417331812999, + "grad_norm": 2.75, + "learning_rate": 1.5419865120075267e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8829583674669266, + "num_tokens": 302430317.0, + "step": 2828 + }, + { + "epoch": 6.444697833523375, + "grad_norm": 3.734375, + "learning_rate": 1.5402471117544143e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.8828336149454117, + "num_tokens": 302537141.0, + "step": 2829 + }, + { + "epoch": 6.4469783352337515, + "grad_norm": 3.1875, + "learning_rate": 1.5385082562385112e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8884585201740265, + "num_tokens": 302644739.0, + "step": 2830 + }, + { + "epoch": 6.449258836944128, + "grad_norm": 2.828125, + "learning_rate": 1.5367699464467596e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8810509741306305, + "num_tokens": 302751800.0, + "step": 2831 + }, + { + "epoch": 6.451539338654504, + "grad_norm": 4.1875, + "learning_rate": 1.5350321833657904e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.8751820474863052, + "num_tokens": 302858532.0, + "step": 2832 + }, + { + "epoch": 6.45381984036488, + "grad_norm": 3.046875, + "learning_rate": 1.5332949679819251e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8875158727169037, + "num_tokens": 302965381.0, + "step": 2833 + }, + { + "epoch": 6.456100342075256, + "grad_norm": 3.6875, + "learning_rate": 1.531558301281173e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8815342336893082, + "num_tokens": 303072747.0, + "step": 2834 + }, + { + "epoch": 6.4583808437856325, + "grad_norm": 2.9375, + "learning_rate": 1.5298221842492328e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8833811283111572, + "num_tokens": 303179347.0, + "step": 2835 + }, + { + "epoch": 6.460661345496009, + "grad_norm": 3.96875, + "learning_rate": 1.5280866178714898e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8801722079515457, + "num_tokens": 303287273.0, + "step": 2836 + }, + { + "epoch": 6.462941847206386, + "grad_norm": 3.453125, + "learning_rate": 1.5263516031330195e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8790634274482727, + "num_tokens": 303394414.0, + "step": 2837 + }, + { + "epoch": 6.465222348916762, + "grad_norm": 2.890625, + "learning_rate": 1.524617141018582e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8738131821155548, + "num_tokens": 303501157.0, + "step": 2838 + }, + { + "epoch": 6.467502850627138, + "grad_norm": 2.796875, + "learning_rate": 1.5228832325126248e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.881572037935257, + "num_tokens": 303608294.0, + "step": 2839 + }, + { + "epoch": 6.469783352337514, + "grad_norm": 4.21875, + "learning_rate": 1.5211498785992818e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8818154633045197, + "num_tokens": 303715138.0, + "step": 2840 + }, + { + "epoch": 6.4720638540478905, + "grad_norm": 3.65625, + "learning_rate": 1.5194170802623692e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8801160901784897, + "num_tokens": 303822023.0, + "step": 2841 + }, + { + "epoch": 6.474344355758267, + "grad_norm": 3.484375, + "learning_rate": 1.5176848384853913e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8776775300502777, + "num_tokens": 303929007.0, + "step": 2842 + }, + { + "epoch": 6.476624857468643, + "grad_norm": 2.859375, + "learning_rate": 1.515953154251535e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8842798173427582, + "num_tokens": 304035636.0, + "step": 2843 + }, + { + "epoch": 6.478905359179019, + "grad_norm": 4.53125, + "learning_rate": 1.5142220285436701e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8824753314256668, + "num_tokens": 304142349.0, + "step": 2844 + }, + { + "epoch": 6.481185860889395, + "grad_norm": 4.0, + "learning_rate": 1.512491462344351e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8803005963563919, + "num_tokens": 304249360.0, + "step": 2845 + }, + { + "epoch": 6.483466362599772, + "grad_norm": 2.9375, + "learning_rate": 1.5107614566358136e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8824707418680191, + "num_tokens": 304357003.0, + "step": 2846 + }, + { + "epoch": 6.485746864310149, + "grad_norm": 3.15625, + "learning_rate": 1.5090320123999746e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.8782989233732224, + "num_tokens": 304463367.0, + "step": 2847 + }, + { + "epoch": 6.488027366020525, + "grad_norm": 4.90625, + "learning_rate": 1.5073031306184343e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.882141649723053, + "num_tokens": 304570072.0, + "step": 2848 + }, + { + "epoch": 6.490307867730901, + "grad_norm": 3.15625, + "learning_rate": 1.5055748122724722e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8819586336612701, + "num_tokens": 304677214.0, + "step": 2849 + }, + { + "epoch": 6.492588369441277, + "grad_norm": 2.4375, + "learning_rate": 1.5038470583430485e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.8865179270505905, + "num_tokens": 304784053.0, + "step": 2850 + }, + { + "epoch": 6.494868871151653, + "grad_norm": 3.78125, + "learning_rate": 1.5021198698108038e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8798740953207016, + "num_tokens": 304891175.0, + "step": 2851 + }, + { + "epoch": 6.4971493728620295, + "grad_norm": 6.3125, + "learning_rate": 1.5003932476560554e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8817462623119354, + "num_tokens": 304997679.0, + "step": 2852 + }, + { + "epoch": 6.499429874572406, + "grad_norm": 4.03125, + "learning_rate": 1.4986671928588016e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8805342614650726, + "num_tokens": 305104659.0, + "step": 2853 + }, + { + "epoch": 6.501710376282782, + "grad_norm": 7.0, + "learning_rate": 1.496941706398718e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8827610611915588, + "num_tokens": 305211801.0, + "step": 2854 + }, + { + "epoch": 6.503990877993158, + "grad_norm": 3.4375, + "learning_rate": 1.495216789255156e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8812805563211441, + "num_tokens": 305317978.0, + "step": 2855 + }, + { + "epoch": 6.506271379703534, + "grad_norm": 2.78125, + "learning_rate": 1.4934924424071479e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8763656914234161, + "num_tokens": 305424656.0, + "step": 2856 + }, + { + "epoch": 6.508551881413911, + "grad_norm": 2.71875, + "learning_rate": 1.4917686668333975e-06, + "loss": 0.5881, + "mean_token_accuracy": 0.881349191069603, + "num_tokens": 305531468.0, + "step": 2857 + }, + { + "epoch": 6.510832383124288, + "grad_norm": 6.21875, + "learning_rate": 1.4900454635122866e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8755508661270142, + "num_tokens": 305638201.0, + "step": 2858 + }, + { + "epoch": 6.513112884834664, + "grad_norm": 3.921875, + "learning_rate": 1.4883228334218727e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8790022134780884, + "num_tokens": 305745038.0, + "step": 2859 + }, + { + "epoch": 6.51539338654504, + "grad_norm": 6.1875, + "learning_rate": 1.4866007775398874e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8798090815544128, + "num_tokens": 305852570.0, + "step": 2860 + }, + { + "epoch": 6.51539338654504, + "eval_loss": 0.5866165161132812, + "eval_mean_token_accuracy": 0.8798247088497583, + "eval_num_tokens": 305852570.0, + "eval_runtime": 58.6358, + "eval_samples_per_second": 143.001, + "eval_steps_per_second": 4.485, + "step": 2860 + }, + { + "epoch": 6.517673888255416, + "grad_norm": 2.484375, + "learning_rate": 1.4848792968437376e-06, + "loss": 0.54, + "mean_token_accuracy": 0.888011172413826, + "num_tokens": 305960317.0, + "step": 2861 + }, + { + "epoch": 6.519954389965792, + "grad_norm": 3.234375, + "learning_rate": 1.4831583923105e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8860371559858322, + "num_tokens": 306067692.0, + "step": 2862 + }, + { + "epoch": 6.5222348916761685, + "grad_norm": 2.921875, + "learning_rate": 1.481438064916928e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8848079293966293, + "num_tokens": 306174375.0, + "step": 2863 + }, + { + "epoch": 6.524515393386545, + "grad_norm": 3.1875, + "learning_rate": 1.4797183156394462e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8868988156318665, + "num_tokens": 306282289.0, + "step": 2864 + }, + { + "epoch": 6.526795895096921, + "grad_norm": 2.984375, + "learning_rate": 1.477999145454152e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8846388012170792, + "num_tokens": 306389414.0, + "step": 2865 + }, + { + "epoch": 6.529076396807298, + "grad_norm": 5.15625, + "learning_rate": 1.4762805553368115e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8790937066078186, + "num_tokens": 306496182.0, + "step": 2866 + }, + { + "epoch": 6.531356898517674, + "grad_norm": 3.90625, + "learning_rate": 1.4745625462628654e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8804367631673813, + "num_tokens": 306603764.0, + "step": 2867 + }, + { + "epoch": 6.53363740022805, + "grad_norm": 4.0, + "learning_rate": 1.47284511920742e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8812845051288605, + "num_tokens": 306710535.0, + "step": 2868 + }, + { + "epoch": 6.535917901938427, + "grad_norm": 3.421875, + "learning_rate": 1.4711282751452549e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.8755820095539093, + "num_tokens": 306817409.0, + "step": 2869 + }, + { + "epoch": 6.538198403648803, + "grad_norm": 3.28125, + "learning_rate": 1.4694120150508179e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8860864490270615, + "num_tokens": 306924651.0, + "step": 2870 + }, + { + "epoch": 6.540478905359179, + "grad_norm": 3.921875, + "learning_rate": 1.4676963398982248e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8829345405101776, + "num_tokens": 307032470.0, + "step": 2871 + }, + { + "epoch": 6.542759407069555, + "grad_norm": 3.390625, + "learning_rate": 1.4659812506612608e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8877181857824326, + "num_tokens": 307139315.0, + "step": 2872 + }, + { + "epoch": 6.545039908779931, + "grad_norm": 3.578125, + "learning_rate": 1.4642667483133753e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8820486813783646, + "num_tokens": 307245855.0, + "step": 2873 + }, + { + "epoch": 6.5473204104903076, + "grad_norm": 4.0, + "learning_rate": 1.4625528338276879e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8795684576034546, + "num_tokens": 307352674.0, + "step": 2874 + }, + { + "epoch": 6.549600912200685, + "grad_norm": 3.015625, + "learning_rate": 1.4608395081769833e-06, + "loss": 0.551, + "mean_token_accuracy": 0.885527640581131, + "num_tokens": 307460336.0, + "step": 2875 + }, + { + "epoch": 6.55188141391106, + "grad_norm": 2.875, + "learning_rate": 1.4591267723337122e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8819777965545654, + "num_tokens": 307567742.0, + "step": 2876 + }, + { + "epoch": 6.554161915621437, + "grad_norm": 2.75, + "learning_rate": 1.4574146272699914e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8834614157676697, + "num_tokens": 307674838.0, + "step": 2877 + }, + { + "epoch": 6.556442417331813, + "grad_norm": 3.015625, + "learning_rate": 1.4557030739575988e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8842470943927765, + "num_tokens": 307782038.0, + "step": 2878 + }, + { + "epoch": 6.558722919042189, + "grad_norm": 6.40625, + "learning_rate": 1.4539921133679808e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8773439824581146, + "num_tokens": 307889575.0, + "step": 2879 + }, + { + "epoch": 6.561003420752566, + "grad_norm": 2.609375, + "learning_rate": 1.4522817464722453e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8828644007444382, + "num_tokens": 307996562.0, + "step": 2880 + }, + { + "epoch": 6.563283922462942, + "grad_norm": 3.03125, + "learning_rate": 1.4505719742411644e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8790508657693863, + "num_tokens": 308103487.0, + "step": 2881 + }, + { + "epoch": 6.565564424173318, + "grad_norm": 3.140625, + "learning_rate": 1.44886279764517e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.881091371178627, + "num_tokens": 308210017.0, + "step": 2882 + }, + { + "epoch": 6.567844925883694, + "grad_norm": 3.890625, + "learning_rate": 1.4471542176543587e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8873138725757599, + "num_tokens": 308316619.0, + "step": 2883 + }, + { + "epoch": 6.57012542759407, + "grad_norm": 3.421875, + "learning_rate": 1.4454462352384885e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8838976472616196, + "num_tokens": 308424126.0, + "step": 2884 + }, + { + "epoch": 6.572405929304447, + "grad_norm": 2.40625, + "learning_rate": 1.4437388513669754e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.880536213517189, + "num_tokens": 308531132.0, + "step": 2885 + }, + { + "epoch": 6.574686431014824, + "grad_norm": 3.09375, + "learning_rate": 1.4420320670088977e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.8818255960941315, + "num_tokens": 308639423.0, + "step": 2886 + }, + { + "epoch": 6.5769669327252, + "grad_norm": 3.265625, + "learning_rate": 1.4403258831329947e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8803568929433823, + "num_tokens": 308746298.0, + "step": 2887 + }, + { + "epoch": 6.579247434435576, + "grad_norm": 3.0625, + "learning_rate": 1.4386203007076632e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8798245489597321, + "num_tokens": 308853686.0, + "step": 2888 + }, + { + "epoch": 6.581527936145952, + "grad_norm": 4.53125, + "learning_rate": 1.4369153207009573e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8845240324735641, + "num_tokens": 308960699.0, + "step": 2889 + }, + { + "epoch": 6.583808437856328, + "grad_norm": 3.84375, + "learning_rate": 1.4352109440805917e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.880035325884819, + "num_tokens": 309067894.0, + "step": 2890 + }, + { + "epoch": 6.586088939566705, + "grad_norm": 2.9375, + "learning_rate": 1.4335071718139379e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8802482336759567, + "num_tokens": 309175232.0, + "step": 2891 + }, + { + "epoch": 6.588369441277081, + "grad_norm": 3.3125, + "learning_rate": 1.4318040048680238e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8893679231405258, + "num_tokens": 309282678.0, + "step": 2892 + }, + { + "epoch": 6.590649942987457, + "grad_norm": 3.671875, + "learning_rate": 1.430101444209535e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8879834562540054, + "num_tokens": 309390130.0, + "step": 2893 + }, + { + "epoch": 6.592930444697833, + "grad_norm": 3.421875, + "learning_rate": 1.4283994908048107e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8846594393253326, + "num_tokens": 309497117.0, + "step": 2894 + }, + { + "epoch": 6.59521094640821, + "grad_norm": 3.171875, + "learning_rate": 1.426698145619847e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8807022422552109, + "num_tokens": 309604240.0, + "step": 2895 + }, + { + "epoch": 6.5974914481185865, + "grad_norm": 2.640625, + "learning_rate": 1.424997409620295e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.881415531039238, + "num_tokens": 309712035.0, + "step": 2896 + }, + { + "epoch": 6.599771949828963, + "grad_norm": 3.140625, + "learning_rate": 1.4232972837714598e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8861576318740845, + "num_tokens": 309818871.0, + "step": 2897 + }, + { + "epoch": 6.602052451539339, + "grad_norm": 3.265625, + "learning_rate": 1.4215977690382998e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8870658576488495, + "num_tokens": 309926653.0, + "step": 2898 + }, + { + "epoch": 6.604332953249715, + "grad_norm": 2.96875, + "learning_rate": 1.4198988663854276e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8792040795087814, + "num_tokens": 310033580.0, + "step": 2899 + }, + { + "epoch": 6.606613454960091, + "grad_norm": 2.859375, + "learning_rate": 1.4182005767771057e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8829011768102646, + "num_tokens": 310141344.0, + "step": 2900 + }, + { + "epoch": 6.608893956670467, + "grad_norm": 3.734375, + "learning_rate": 1.4165029011772513e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8812812268733978, + "num_tokens": 310247976.0, + "step": 2901 + }, + { + "epoch": 6.611174458380844, + "grad_norm": 3.1875, + "learning_rate": 1.4148058405494328e-06, + "loss": 0.572, + "mean_token_accuracy": 0.8837906569242477, + "num_tokens": 310354966.0, + "step": 2902 + }, + { + "epoch": 6.61345496009122, + "grad_norm": 3.96875, + "learning_rate": 1.4131093958568695e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8823096752166748, + "num_tokens": 310462145.0, + "step": 2903 + }, + { + "epoch": 6.615735461801596, + "grad_norm": 3.21875, + "learning_rate": 1.4114135680624291e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8795100599527359, + "num_tokens": 310568947.0, + "step": 2904 + }, + { + "epoch": 6.618015963511972, + "grad_norm": 3.046875, + "learning_rate": 1.4097183581286322e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8784335255622864, + "num_tokens": 310675494.0, + "step": 2905 + }, + { + "epoch": 6.620296465222349, + "grad_norm": 4.53125, + "learning_rate": 1.4080237670176456e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8832641243934631, + "num_tokens": 310782897.0, + "step": 2906 + }, + { + "epoch": 6.6225769669327255, + "grad_norm": 3.125, + "learning_rate": 1.4063297956912875e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8814590126276016, + "num_tokens": 310890161.0, + "step": 2907 + }, + { + "epoch": 6.624857468643102, + "grad_norm": 2.5, + "learning_rate": 1.4046364451110234e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.884671688079834, + "num_tokens": 310996832.0, + "step": 2908 + }, + { + "epoch": 6.627137970353478, + "grad_norm": 3.265625, + "learning_rate": 1.4029437162379666e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8783999532461166, + "num_tokens": 311103426.0, + "step": 2909 + }, + { + "epoch": 6.629418472063854, + "grad_norm": 4.3125, + "learning_rate": 1.4012516100328766e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8824923038482666, + "num_tokens": 311210237.0, + "step": 2910 + }, + { + "epoch": 6.63169897377423, + "grad_norm": 4.40625, + "learning_rate": 1.3995601274561605e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8831194937229156, + "num_tokens": 311317987.0, + "step": 2911 + }, + { + "epoch": 6.633979475484606, + "grad_norm": 3.0625, + "learning_rate": 1.3978692694678711e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8810025453567505, + "num_tokens": 311424839.0, + "step": 2912 + }, + { + "epoch": 6.636259977194983, + "grad_norm": 2.90625, + "learning_rate": 1.3961790370277068e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.8853645473718643, + "num_tokens": 311531661.0, + "step": 2913 + }, + { + "epoch": 6.638540478905359, + "grad_norm": 2.734375, + "learning_rate": 1.3944894310950113e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8820922076702118, + "num_tokens": 311639163.0, + "step": 2914 + }, + { + "epoch": 6.640820980615736, + "grad_norm": 3.5625, + "learning_rate": 1.3928004526287729e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8839680552482605, + "num_tokens": 311746011.0, + "step": 2915 + }, + { + "epoch": 6.643101482326112, + "grad_norm": 3.59375, + "learning_rate": 1.3911121025876212e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8828011155128479, + "num_tokens": 311852925.0, + "step": 2916 + }, + { + "epoch": 6.645381984036488, + "grad_norm": 3.296875, + "learning_rate": 1.389424381929832e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8824039697647095, + "num_tokens": 311960025.0, + "step": 2917 + }, + { + "epoch": 6.6476624857468645, + "grad_norm": 2.9375, + "learning_rate": 1.3877372916133234e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.8741250783205032, + "num_tokens": 312066896.0, + "step": 2918 + }, + { + "epoch": 6.649942987457241, + "grad_norm": 4.6875, + "learning_rate": 1.3860508325956549e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8746428936719894, + "num_tokens": 312173856.0, + "step": 2919 + }, + { + "epoch": 6.652223489167617, + "grad_norm": 3.59375, + "learning_rate": 1.3843650058340291e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8829982429742813, + "num_tokens": 312280996.0, + "step": 2920 + }, + { + "epoch": 6.654503990877993, + "grad_norm": 3.859375, + "learning_rate": 1.382679812285287e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8866865336894989, + "num_tokens": 312388440.0, + "step": 2921 + }, + { + "epoch": 6.656784492588369, + "grad_norm": 3.21875, + "learning_rate": 1.3809952529059127e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8881336748600006, + "num_tokens": 312495421.0, + "step": 2922 + }, + { + "epoch": 6.659064994298745, + "grad_norm": 4.0, + "learning_rate": 1.3793113286520293e-06, + "loss": 0.563, + "mean_token_accuracy": 0.884793609380722, + "num_tokens": 312602917.0, + "step": 2923 + }, + { + "epoch": 6.661345496009122, + "grad_norm": 3.390625, + "learning_rate": 1.3776280404794016e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8788406997919083, + "num_tokens": 312710035.0, + "step": 2924 + }, + { + "epoch": 6.663625997719498, + "grad_norm": 2.84375, + "learning_rate": 1.3759453893434285e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8826200515031815, + "num_tokens": 312816809.0, + "step": 2925 + }, + { + "epoch": 6.665906499429875, + "grad_norm": 3.640625, + "learning_rate": 1.3742633761991519e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8815308213233948, + "num_tokens": 312923988.0, + "step": 2926 + }, + { + "epoch": 6.668187001140251, + "grad_norm": 3.28125, + "learning_rate": 1.3725820020012506e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8900953233242035, + "num_tokens": 313031446.0, + "step": 2927 + }, + { + "epoch": 6.670467502850627, + "grad_norm": 2.59375, + "learning_rate": 1.3709012677040385e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.882040336728096, + "num_tokens": 313138636.0, + "step": 2928 + }, + { + "epoch": 6.6727480045610035, + "grad_norm": 2.671875, + "learning_rate": 1.3692211742614686e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8804366439580917, + "num_tokens": 313245263.0, + "step": 2929 + }, + { + "epoch": 6.67502850627138, + "grad_norm": 4.25, + "learning_rate": 1.3675417226271298e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8823003172874451, + "num_tokens": 313353027.0, + "step": 2930 + }, + { + "epoch": 6.677309007981756, + "grad_norm": 3.15625, + "learning_rate": 1.365862913754247e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8799808919429779, + "num_tokens": 313460650.0, + "step": 2931 + }, + { + "epoch": 6.679589509692132, + "grad_norm": 5.90625, + "learning_rate": 1.3641847485956782e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8758883625268936, + "num_tokens": 313567543.0, + "step": 2932 + }, + { + "epoch": 6.681870011402508, + "grad_norm": 3.015625, + "learning_rate": 1.362507228103918e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8851123601198196, + "num_tokens": 313674854.0, + "step": 2933 + }, + { + "epoch": 6.684150513112884, + "grad_norm": 2.59375, + "learning_rate": 1.3608303532310956e-06, + "loss": 0.5605, + "mean_token_accuracy": 0.884984016418457, + "num_tokens": 313781754.0, + "step": 2934 + }, + { + "epoch": 6.6864310148232615, + "grad_norm": 2.96875, + "learning_rate": 1.3591541249289718e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8888740241527557, + "num_tokens": 313889331.0, + "step": 2935 + }, + { + "epoch": 6.688711516533638, + "grad_norm": 2.75, + "learning_rate": 1.357478544148943e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8864340782165527, + "num_tokens": 313997028.0, + "step": 2936 + }, + { + "epoch": 6.690992018244014, + "grad_norm": 2.78125, + "learning_rate": 1.3558036118420343e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8856192529201508, + "num_tokens": 314104052.0, + "step": 2937 + }, + { + "epoch": 6.69327251995439, + "grad_norm": 3.25, + "learning_rate": 1.3541293289589058e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.882482260465622, + "num_tokens": 314211575.0, + "step": 2938 + }, + { + "epoch": 6.695553021664766, + "grad_norm": 4.125, + "learning_rate": 1.3524556964498482e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.88676318526268, + "num_tokens": 314318790.0, + "step": 2939 + }, + { + "epoch": 6.6978335233751425, + "grad_norm": 2.78125, + "learning_rate": 1.3507827152647835e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8853241056203842, + "num_tokens": 314425965.0, + "step": 2940 + }, + { + "epoch": 6.700114025085519, + "grad_norm": 2.53125, + "learning_rate": 1.3491103863532626e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8847237229347229, + "num_tokens": 314533161.0, + "step": 2941 + }, + { + "epoch": 6.702394526795895, + "grad_norm": 3.46875, + "learning_rate": 1.3474387106644688e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8763073831796646, + "num_tokens": 314640305.0, + "step": 2942 + }, + { + "epoch": 6.704675028506271, + "grad_norm": 4.5, + "learning_rate": 1.345767689147211e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8827333003282547, + "num_tokens": 314747484.0, + "step": 2943 + }, + { + "epoch": 6.706955530216648, + "grad_norm": 2.90625, + "learning_rate": 1.3440973227499293e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8817901015281677, + "num_tokens": 314854356.0, + "step": 2944 + }, + { + "epoch": 6.7092360319270234, + "grad_norm": 2.6875, + "learning_rate": 1.3424276124206917e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8832406252622604, + "num_tokens": 314961203.0, + "step": 2945 + }, + { + "epoch": 6.7115165336374005, + "grad_norm": 3.0, + "learning_rate": 1.3407585591071944e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8804884552955627, + "num_tokens": 315067663.0, + "step": 2946 + }, + { + "epoch": 6.713797035347777, + "grad_norm": 6.0, + "learning_rate": 1.3390901637567579e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8794064670801163, + "num_tokens": 315175060.0, + "step": 2947 + }, + { + "epoch": 6.716077537058153, + "grad_norm": 3.15625, + "learning_rate": 1.3374224273163334e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8783332705497742, + "num_tokens": 315281889.0, + "step": 2948 + }, + { + "epoch": 6.718358038768529, + "grad_norm": 3.4375, + "learning_rate": 1.3357553507324938e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8839272111654282, + "num_tokens": 315388559.0, + "step": 2949 + }, + { + "epoch": 6.720638540478905, + "grad_norm": 5.5625, + "learning_rate": 1.3340889349514403e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8809895366430283, + "num_tokens": 315495344.0, + "step": 2950 + }, + { + "epoch": 6.7229190421892815, + "grad_norm": 3.9375, + "learning_rate": 1.3324231809189985e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8813484013080597, + "num_tokens": 315602116.0, + "step": 2951 + }, + { + "epoch": 6.725199543899658, + "grad_norm": 3.453125, + "learning_rate": 1.3307580895806194e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8836003094911575, + "num_tokens": 315709172.0, + "step": 2952 + }, + { + "epoch": 6.727480045610034, + "grad_norm": 3.375, + "learning_rate": 1.3290936618813747e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8845668733119965, + "num_tokens": 315816102.0, + "step": 2953 + }, + { + "epoch": 6.72976054732041, + "grad_norm": 2.703125, + "learning_rate": 1.327429898765962e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.8821906894445419, + "num_tokens": 315923130.0, + "step": 2954 + }, + { + "epoch": 6.732041049030787, + "grad_norm": 3.21875, + "learning_rate": 1.3257668011787018e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8804189115762711, + "num_tokens": 316030172.0, + "step": 2955 + }, + { + "epoch": 6.734321550741163, + "grad_norm": 2.734375, + "learning_rate": 1.3241043700635352e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8796321153640747, + "num_tokens": 316136895.0, + "step": 2956 + }, + { + "epoch": 6.7366020524515395, + "grad_norm": 3.09375, + "learning_rate": 1.3224426063640272e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8832730054855347, + "num_tokens": 316243931.0, + "step": 2957 + }, + { + "epoch": 6.738882554161916, + "grad_norm": 3.015625, + "learning_rate": 1.320781511023363e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.883409783244133, + "num_tokens": 316351625.0, + "step": 2958 + }, + { + "epoch": 6.741163055872292, + "grad_norm": 4.84375, + "learning_rate": 1.3191210849843461e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8800751566886902, + "num_tokens": 316459647.0, + "step": 2959 + }, + { + "epoch": 6.743443557582668, + "grad_norm": 4.15625, + "learning_rate": 1.3174613291894039e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8763148337602615, + "num_tokens": 316566718.0, + "step": 2960 + }, + { + "epoch": 6.745724059293044, + "grad_norm": 3.34375, + "learning_rate": 1.3158022445805816e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8837677389383316, + "num_tokens": 316673847.0, + "step": 2961 + }, + { + "epoch": 6.7480045610034205, + "grad_norm": 2.953125, + "learning_rate": 1.3141438320995433e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8806140273809433, + "num_tokens": 316780635.0, + "step": 2962 + }, + { + "epoch": 6.750285062713797, + "grad_norm": 2.796875, + "learning_rate": 1.3124860926875732e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8896244168281555, + "num_tokens": 316887938.0, + "step": 2963 + }, + { + "epoch": 6.752565564424174, + "grad_norm": 2.84375, + "learning_rate": 1.3108290272855697e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8813838809728622, + "num_tokens": 316995335.0, + "step": 2964 + }, + { + "epoch": 6.75484606613455, + "grad_norm": 5.0625, + "learning_rate": 1.309172636834053e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8819121271371841, + "num_tokens": 317102837.0, + "step": 2965 + }, + { + "epoch": 6.757126567844926, + "grad_norm": 2.90625, + "learning_rate": 1.3075169222731573e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8826228231191635, + "num_tokens": 317210216.0, + "step": 2966 + }, + { + "epoch": 6.759407069555302, + "grad_norm": 3.5, + "learning_rate": 1.305861884542636e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8787815272808075, + "num_tokens": 317317069.0, + "step": 2967 + }, + { + "epoch": 6.7616875712656785, + "grad_norm": 4.875, + "learning_rate": 1.3042075245818542e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8827643990516663, + "num_tokens": 317424086.0, + "step": 2968 + }, + { + "epoch": 6.763968072976055, + "grad_norm": 3.296875, + "learning_rate": 1.3025538433297957e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8830547332763672, + "num_tokens": 317531744.0, + "step": 2969 + }, + { + "epoch": 6.766248574686431, + "grad_norm": 2.765625, + "learning_rate": 1.3009008417250597e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.883535161614418, + "num_tokens": 317638896.0, + "step": 2970 + }, + { + "epoch": 6.768529076396807, + "grad_norm": 2.53125, + "learning_rate": 1.2992485207058548e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8761639446020126, + "num_tokens": 317745636.0, + "step": 2971 + }, + { + "epoch": 6.770809578107183, + "grad_norm": 2.625, + "learning_rate": 1.2975968812100081e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8862400203943253, + "num_tokens": 317852822.0, + "step": 2972 + }, + { + "epoch": 6.7730900798175595, + "grad_norm": 4.6875, + "learning_rate": 1.295945924174959e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8786645531654358, + "num_tokens": 317959386.0, + "step": 2973 + }, + { + "epoch": 6.775370581527936, + "grad_norm": 3.03125, + "learning_rate": 1.2942956505377585e-06, + "loss": 0.565, + "mean_token_accuracy": 0.8845228105783463, + "num_tokens": 318066201.0, + "step": 2974 + }, + { + "epoch": 6.777651083238313, + "grad_norm": 3.703125, + "learning_rate": 1.2926460612350688e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.883774995803833, + "num_tokens": 318173322.0, + "step": 2975 + }, + { + "epoch": 6.779931584948689, + "grad_norm": 3.421875, + "learning_rate": 1.2909971572031663e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8879963010549545, + "num_tokens": 318281003.0, + "step": 2976 + }, + { + "epoch": 6.782212086659065, + "grad_norm": 3.375, + "learning_rate": 1.2893489393779362e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8839980363845825, + "num_tokens": 318387942.0, + "step": 2977 + }, + { + "epoch": 6.784492588369441, + "grad_norm": 2.53125, + "learning_rate": 1.2877014086948762e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8807279914617538, + "num_tokens": 318494877.0, + "step": 2978 + }, + { + "epoch": 6.7867730900798175, + "grad_norm": 3.171875, + "learning_rate": 1.2860545660890928e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8867141306400299, + "num_tokens": 318601668.0, + "step": 2979 + }, + { + "epoch": 6.789053591790194, + "grad_norm": 2.34375, + "learning_rate": 1.2844084124953006e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8846204280853271, + "num_tokens": 318708916.0, + "step": 2980 + }, + { + "epoch": 6.79133409350057, + "grad_norm": 5.25, + "learning_rate": 1.2827629488478254e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8803286552429199, + "num_tokens": 318816351.0, + "step": 2981 + }, + { + "epoch": 6.793614595210946, + "grad_norm": 3.25, + "learning_rate": 1.2811181760806013e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8851221948862076, + "num_tokens": 318923078.0, + "step": 2982 + }, + { + "epoch": 6.795895096921322, + "grad_norm": 4.21875, + "learning_rate": 1.2794740951271686e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8797405809164047, + "num_tokens": 319029672.0, + "step": 2983 + }, + { + "epoch": 6.798175598631699, + "grad_norm": 3.765625, + "learning_rate": 1.2778307069206764e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8823461681604385, + "num_tokens": 319136574.0, + "step": 2984 + }, + { + "epoch": 6.800456100342076, + "grad_norm": 3.265625, + "learning_rate": 1.2761880123938814e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8868078589439392, + "num_tokens": 319244374.0, + "step": 2985 + }, + { + "epoch": 6.802736602052452, + "grad_norm": 3.09375, + "learning_rate": 1.2745460124791425e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8795672506093979, + "num_tokens": 319352045.0, + "step": 2986 + }, + { + "epoch": 6.805017103762828, + "grad_norm": 3.59375, + "learning_rate": 1.272904708108429e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8827357441186905, + "num_tokens": 319458393.0, + "step": 2987 + }, + { + "epoch": 6.807297605473204, + "grad_norm": 3.421875, + "learning_rate": 1.2712641002133128e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8749971687793732, + "num_tokens": 319564852.0, + "step": 2988 + }, + { + "epoch": 6.80957810718358, + "grad_norm": 3.65625, + "learning_rate": 1.2696241897249728e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8824810981750488, + "num_tokens": 319671606.0, + "step": 2989 + }, + { + "epoch": 6.811858608893957, + "grad_norm": 3.578125, + "learning_rate": 1.2679849775741884e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8874455839395523, + "num_tokens": 319778774.0, + "step": 2990 + }, + { + "epoch": 6.814139110604333, + "grad_norm": 2.84375, + "learning_rate": 1.266346464691346e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8811825066804886, + "num_tokens": 319885624.0, + "step": 2991 + }, + { + "epoch": 6.816419612314709, + "grad_norm": 3.25, + "learning_rate": 1.2647086520064343e-06, + "loss": 0.575, + "mean_token_accuracy": 0.880624532699585, + "num_tokens": 319992897.0, + "step": 2992 + }, + { + "epoch": 6.818700114025085, + "grad_norm": 2.90625, + "learning_rate": 1.2630715404490424e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8810169398784637, + "num_tokens": 320099665.0, + "step": 2993 + }, + { + "epoch": 6.820980615735461, + "grad_norm": 2.625, + "learning_rate": 1.2614351309483646e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8881228417158127, + "num_tokens": 320207542.0, + "step": 2994 + }, + { + "epoch": 6.823261117445838, + "grad_norm": 2.78125, + "learning_rate": 1.259799424433196e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8811034262180328, + "num_tokens": 320314111.0, + "step": 2995 + }, + { + "epoch": 6.825541619156215, + "grad_norm": 3.171875, + "learning_rate": 1.25816442183193e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8809540122747421, + "num_tokens": 320421117.0, + "step": 2996 + }, + { + "epoch": 6.827822120866591, + "grad_norm": 2.953125, + "learning_rate": 1.2565301240725636e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8778086453676224, + "num_tokens": 320527962.0, + "step": 2997 + }, + { + "epoch": 6.830102622576967, + "grad_norm": 3.015625, + "learning_rate": 1.2548965320826928e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.883897066116333, + "num_tokens": 320634881.0, + "step": 2998 + }, + { + "epoch": 6.832383124287343, + "grad_norm": 3.0, + "learning_rate": 1.2532636467895126e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8828509300947189, + "num_tokens": 320741780.0, + "step": 2999 + }, + { + "epoch": 6.834663625997719, + "grad_norm": 3.375, + "learning_rate": 1.2516314691198172e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8836653828620911, + "num_tokens": 320849047.0, + "step": 3000 + }, + { + "epoch": 6.836944127708096, + "grad_norm": 3.890625, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8793622553348541, + "num_tokens": 320955644.0, + "step": 3001 + }, + { + "epoch": 6.839224629418472, + "grad_norm": 3.75, + "learning_rate": 1.2483692403560507e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8884700238704681, + "num_tokens": 321063849.0, + "step": 3002 + }, + { + "epoch": 6.841505131128848, + "grad_norm": 2.921875, + "learning_rate": 1.2467391911135562e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8834027796983719, + "num_tokens": 321170372.0, + "step": 3003 + }, + { + "epoch": 6.843785632839225, + "grad_norm": 4.0, + "learning_rate": 1.2451098531977015e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8847081363201141, + "num_tokens": 321277858.0, + "step": 3004 + }, + { + "epoch": 6.846066134549601, + "grad_norm": 2.875, + "learning_rate": 1.2434812275332678e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8904493749141693, + "num_tokens": 321384888.0, + "step": 3005 + }, + { + "epoch": 6.848346636259977, + "grad_norm": 2.796875, + "learning_rate": 1.2418533150446324e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8845214545726776, + "num_tokens": 321491996.0, + "step": 3006 + }, + { + "epoch": 6.850627137970354, + "grad_norm": 2.796875, + "learning_rate": 1.2402261166557647e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8830210715532303, + "num_tokens": 321598410.0, + "step": 3007 + }, + { + "epoch": 6.85290763968073, + "grad_norm": 2.765625, + "learning_rate": 1.2385996332902326e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8822718113660812, + "num_tokens": 321705299.0, + "step": 3008 + }, + { + "epoch": 6.855188141391106, + "grad_norm": 2.765625, + "learning_rate": 1.236973865871196e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8847811222076416, + "num_tokens": 321811999.0, + "step": 3009 + }, + { + "epoch": 6.857468643101482, + "grad_norm": 2.953125, + "learning_rate": 1.2353488153214096e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8817588984966278, + "num_tokens": 321919613.0, + "step": 3010 + }, + { + "epoch": 6.859749144811858, + "grad_norm": 2.640625, + "learning_rate": 1.2337244825632217e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8837725669145584, + "num_tokens": 322026837.0, + "step": 3011 + }, + { + "epoch": 6.862029646522235, + "grad_norm": 3.734375, + "learning_rate": 1.2321008685185699e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8756814897060394, + "num_tokens": 322132737.0, + "step": 3012 + }, + { + "epoch": 6.864310148232612, + "grad_norm": 2.46875, + "learning_rate": 1.2304779741089884e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8843032121658325, + "num_tokens": 322239975.0, + "step": 3013 + }, + { + "epoch": 6.866590649942988, + "grad_norm": 3.90625, + "learning_rate": 1.228855800255599e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8782118856906891, + "num_tokens": 322346564.0, + "step": 3014 + }, + { + "epoch": 6.868871151653364, + "grad_norm": 2.8125, + "learning_rate": 1.2272343478791165e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8836709409952164, + "num_tokens": 322453680.0, + "step": 3015 + }, + { + "epoch": 6.87115165336374, + "grad_norm": 2.6875, + "learning_rate": 1.2256136178998468e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8812815845012665, + "num_tokens": 322561481.0, + "step": 3016 + }, + { + "epoch": 6.873432155074116, + "grad_norm": 3.015625, + "learning_rate": 1.2239936112376858e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8811852186918259, + "num_tokens": 322668877.0, + "step": 3017 + }, + { + "epoch": 6.875712656784493, + "grad_norm": 3.828125, + "learning_rate": 1.2223743288121155e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8837666660547256, + "num_tokens": 322775613.0, + "step": 3018 + }, + { + "epoch": 6.877993158494869, + "grad_norm": 3.171875, + "learning_rate": 1.2207557715422106e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8822371959686279, + "num_tokens": 322883149.0, + "step": 3019 + }, + { + "epoch": 6.880273660205245, + "grad_norm": 2.859375, + "learning_rate": 1.219137940346633e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.880626305937767, + "num_tokens": 322989979.0, + "step": 3020 + }, + { + "epoch": 6.882554161915621, + "grad_norm": 5.4375, + "learning_rate": 1.2175208361436328e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8829739540815353, + "num_tokens": 323096950.0, + "step": 3021 + }, + { + "epoch": 6.884834663625997, + "grad_norm": 3.4375, + "learning_rate": 1.2159044598510473e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8852163255214691, + "num_tokens": 323204095.0, + "step": 3022 + }, + { + "epoch": 6.887115165336374, + "grad_norm": 2.765625, + "learning_rate": 1.2142888123862992e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8801557719707489, + "num_tokens": 323311535.0, + "step": 3023 + }, + { + "epoch": 6.889395667046751, + "grad_norm": 3.703125, + "learning_rate": 1.2126738946663996e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8806950151920319, + "num_tokens": 323418174.0, + "step": 3024 + }, + { + "epoch": 6.891676168757127, + "grad_norm": 3.171875, + "learning_rate": 1.2110597076079448e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8857726603746414, + "num_tokens": 323524933.0, + "step": 3025 + }, + { + "epoch": 6.893956670467503, + "grad_norm": 2.953125, + "learning_rate": 1.2094462521271156e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8852896243333817, + "num_tokens": 323631823.0, + "step": 3026 + }, + { + "epoch": 6.896237172177879, + "grad_norm": 3.015625, + "learning_rate": 1.2078335291396798e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8801656812429428, + "num_tokens": 323738919.0, + "step": 3027 + }, + { + "epoch": 6.898517673888255, + "grad_norm": 3.1875, + "learning_rate": 1.2062215395609856e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8838664293289185, + "num_tokens": 323845761.0, + "step": 3028 + }, + { + "epoch": 6.900798175598632, + "grad_norm": 3.484375, + "learning_rate": 1.2046102843059681e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8809877783060074, + "num_tokens": 323952934.0, + "step": 3029 + }, + { + "epoch": 6.903078677309008, + "grad_norm": 3.046875, + "learning_rate": 1.202999764289145e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8807922154664993, + "num_tokens": 324060199.0, + "step": 3030 + }, + { + "epoch": 6.905359179019384, + "grad_norm": 2.828125, + "learning_rate": 1.201389980424616e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8789155036211014, + "num_tokens": 324166935.0, + "step": 3031 + }, + { + "epoch": 6.90763968072976, + "grad_norm": 3.609375, + "learning_rate": 1.1997809336260644e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8868138492107391, + "num_tokens": 324273307.0, + "step": 3032 + }, + { + "epoch": 6.909920182440137, + "grad_norm": 2.734375, + "learning_rate": 1.1981726248067521e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8851444870233536, + "num_tokens": 324380120.0, + "step": 3033 + }, + { + "epoch": 6.9122006841505135, + "grad_norm": 2.734375, + "learning_rate": 1.1965650548795251e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8851585388183594, + "num_tokens": 324487235.0, + "step": 3034 + }, + { + "epoch": 6.91448118586089, + "grad_norm": 3.484375, + "learning_rate": 1.1949582247568107e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8827362954616547, + "num_tokens": 324594111.0, + "step": 3035 + }, + { + "epoch": 6.916761687571266, + "grad_norm": 5.15625, + "learning_rate": 1.1933521353506117e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8829712569713593, + "num_tokens": 324701818.0, + "step": 3036 + }, + { + "epoch": 6.919042189281642, + "grad_norm": 3.515625, + "learning_rate": 1.1917467875725148e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8853205442428589, + "num_tokens": 324808789.0, + "step": 3037 + }, + { + "epoch": 6.921322690992018, + "grad_norm": 2.828125, + "learning_rate": 1.1901421823336856e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.885327011346817, + "num_tokens": 324916062.0, + "step": 3038 + }, + { + "epoch": 6.923603192702394, + "grad_norm": 4.0625, + "learning_rate": 1.188538320544865e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.880491703748703, + "num_tokens": 325023122.0, + "step": 3039 + }, + { + "epoch": 6.925883694412771, + "grad_norm": 2.859375, + "learning_rate": 1.1869352031163746e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.880371481180191, + "num_tokens": 325130007.0, + "step": 3040 + }, + { + "epoch": 6.928164196123147, + "grad_norm": 3.578125, + "learning_rate": 1.1853328309581139e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.884147435426712, + "num_tokens": 325236627.0, + "step": 3041 + }, + { + "epoch": 6.930444697833523, + "grad_norm": 3.328125, + "learning_rate": 1.183731204979557e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8821654170751572, + "num_tokens": 325344292.0, + "step": 3042 + }, + { + "epoch": 6.932725199543899, + "grad_norm": 2.96875, + "learning_rate": 1.182130326089758e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8848479390144348, + "num_tokens": 325451752.0, + "step": 3043 + }, + { + "epoch": 6.935005701254276, + "grad_norm": 4.71875, + "learning_rate": 1.1805301951973423e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8822412639856339, + "num_tokens": 325558775.0, + "step": 3044 + }, + { + "epoch": 6.9372862029646525, + "grad_norm": 5.1875, + "learning_rate": 1.1789308132105145e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8826945275068283, + "num_tokens": 325665451.0, + "step": 3045 + }, + { + "epoch": 6.939566704675029, + "grad_norm": 3.203125, + "learning_rate": 1.1773321810370527e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8854436427354813, + "num_tokens": 325772180.0, + "step": 3046 + }, + { + "epoch": 6.941847206385405, + "grad_norm": 2.515625, + "learning_rate": 1.1757342995843103e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8795358538627625, + "num_tokens": 325879003.0, + "step": 3047 + }, + { + "epoch": 6.944127708095781, + "grad_norm": 3.125, + "learning_rate": 1.1741371697592134e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8838000446557999, + "num_tokens": 325986026.0, + "step": 3048 + }, + { + "epoch": 6.946408209806157, + "grad_norm": 3.625, + "learning_rate": 1.1725407924682628e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8830106258392334, + "num_tokens": 326093663.0, + "step": 3049 + }, + { + "epoch": 6.9486887115165334, + "grad_norm": 4.8125, + "learning_rate": 1.17094516861753e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8868276029825211, + "num_tokens": 326201499.0, + "step": 3050 + }, + { + "epoch": 6.95096921322691, + "grad_norm": 3.1875, + "learning_rate": 1.1693502991126609e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8848345726728439, + "num_tokens": 326308543.0, + "step": 3051 + }, + { + "epoch": 6.953249714937286, + "grad_norm": 3.359375, + "learning_rate": 1.1677561848588734e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8820251226425171, + "num_tokens": 326415635.0, + "step": 3052 + }, + { + "epoch": 6.955530216647663, + "grad_norm": 3.375, + "learning_rate": 1.166162826760955e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8849399089813232, + "num_tokens": 326523279.0, + "step": 3053 + }, + { + "epoch": 6.957810718358039, + "grad_norm": 2.84375, + "learning_rate": 1.1645702257232663e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8775086104869843, + "num_tokens": 326630109.0, + "step": 3054 + }, + { + "epoch": 6.960091220068415, + "grad_norm": 3.0, + "learning_rate": 1.1629783826497351e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8805986046791077, + "num_tokens": 326737084.0, + "step": 3055 + }, + { + "epoch": 6.9623717217787915, + "grad_norm": 2.625, + "learning_rate": 1.161387298443863e-06, + "loss": 0.5631, + "mean_token_accuracy": 0.8876520395278931, + "num_tokens": 326844156.0, + "step": 3056 + }, + { + "epoch": 6.964652223489168, + "grad_norm": 3.1875, + "learning_rate": 1.1597969740087159e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8821894228458405, + "num_tokens": 326951005.0, + "step": 3057 + }, + { + "epoch": 6.966932725199544, + "grad_norm": 3.953125, + "learning_rate": 1.1582074102469332e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8848588168621063, + "num_tokens": 327058614.0, + "step": 3058 + }, + { + "epoch": 6.96921322690992, + "grad_norm": 3.0, + "learning_rate": 1.1566186080607198e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8877767473459244, + "num_tokens": 327166015.0, + "step": 3059 + }, + { + "epoch": 6.971493728620296, + "grad_norm": 3.96875, + "learning_rate": 1.1550305683518506e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8813445121049881, + "num_tokens": 327273077.0, + "step": 3060 + }, + { + "epoch": 6.9737742303306725, + "grad_norm": 3.109375, + "learning_rate": 1.1534432920216643e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8741457164287567, + "num_tokens": 327379556.0, + "step": 3061 + }, + { + "epoch": 6.976054732041049, + "grad_norm": 2.578125, + "learning_rate": 1.151856779971069e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8825311958789825, + "num_tokens": 327486300.0, + "step": 3062 + }, + { + "epoch": 6.978335233751425, + "grad_norm": 2.8125, + "learning_rate": 1.1502710331005384e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8804716914892197, + "num_tokens": 327593244.0, + "step": 3063 + }, + { + "epoch": 6.980615735461802, + "grad_norm": 2.96875, + "learning_rate": 1.148686052310112e-06, + "loss": 0.596, + "mean_token_accuracy": 0.875918909907341, + "num_tokens": 327699724.0, + "step": 3064 + }, + { + "epoch": 6.982896237172178, + "grad_norm": 3.40625, + "learning_rate": 1.147101838499395e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8763432502746582, + "num_tokens": 327806813.0, + "step": 3065 + }, + { + "epoch": 6.985176738882554, + "grad_norm": 2.875, + "learning_rate": 1.145518392567555e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8882738053798676, + "num_tokens": 327913705.0, + "step": 3066 + }, + { + "epoch": 6.9874572405929305, + "grad_norm": 3.09375, + "learning_rate": 1.1439357154133263e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8874611854553223, + "num_tokens": 328021868.0, + "step": 3067 + }, + { + "epoch": 6.989737742303307, + "grad_norm": 2.828125, + "learning_rate": 1.1423538079350053e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8833891451358795, + "num_tokens": 328129123.0, + "step": 3068 + }, + { + "epoch": 6.992018244013683, + "grad_norm": 3.03125, + "learning_rate": 1.1407726710304525e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8766579329967499, + "num_tokens": 328235771.0, + "step": 3069 + }, + { + "epoch": 6.994298745724059, + "grad_norm": 4.90625, + "learning_rate": 1.139192305597092e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8802380710840225, + "num_tokens": 328342486.0, + "step": 3070 + }, + { + "epoch": 6.996579247434435, + "grad_norm": 3.625, + "learning_rate": 1.1376127125319065e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8793238997459412, + "num_tokens": 328449792.0, + "step": 3071 + }, + { + "epoch": 6.9988597491448115, + "grad_norm": 3.296875, + "learning_rate": 1.1360338927314432e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8853235244750977, + "num_tokens": 328556382.0, + "step": 3072 + }, + { + "epoch": 7.0, + "grad_norm": 5.625, + "learning_rate": 1.1344558470918098e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.8733022511005402, + "num_tokens": 328595624.0, + "step": 3073 + }, + { + "epoch": 7.002280501710376, + "grad_norm": 5.1875, + "learning_rate": 1.1328785765086752e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8809218257665634, + "num_tokens": 328702654.0, + "step": 3074 + }, + { + "epoch": 7.004561003420752, + "grad_norm": 2.96875, + "learning_rate": 1.131302081877268e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8839420080184937, + "num_tokens": 328809480.0, + "step": 3075 + }, + { + "epoch": 7.006841505131129, + "grad_norm": 3.0625, + "learning_rate": 1.1297263640923745e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8816477954387665, + "num_tokens": 328916070.0, + "step": 3076 + }, + { + "epoch": 7.009122006841505, + "grad_norm": 2.71875, + "learning_rate": 1.1281514240483427e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8879576325416565, + "num_tokens": 329022615.0, + "step": 3077 + }, + { + "epoch": 7.011402508551882, + "grad_norm": 3.875, + "learning_rate": 1.1265772626390786e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.888176754117012, + "num_tokens": 329129873.0, + "step": 3078 + }, + { + "epoch": 7.013683010262258, + "grad_norm": 3.0, + "learning_rate": 1.1250038807580449e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8815352469682693, + "num_tokens": 329236596.0, + "step": 3079 + }, + { + "epoch": 7.015963511972634, + "grad_norm": 3.359375, + "learning_rate": 1.1234312792982627e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8854427635669708, + "num_tokens": 329344039.0, + "step": 3080 + }, + { + "epoch": 7.015963511972634, + "eval_loss": 0.5865316987037659, + "eval_mean_token_accuracy": 0.8799761234580791, + "eval_num_tokens": 329344039.0, + "eval_runtime": 58.6794, + "eval_samples_per_second": 142.895, + "eval_steps_per_second": 4.482, + "step": 3080 + }, + { + "epoch": 7.01824401368301, + "grad_norm": 3.078125, + "learning_rate": 1.1218594591523118e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8825812339782715, + "num_tokens": 329451249.0, + "step": 3081 + }, + { + "epoch": 7.020524515393387, + "grad_norm": 2.546875, + "learning_rate": 1.120288421212325e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8850286900997162, + "num_tokens": 329558258.0, + "step": 3082 + }, + { + "epoch": 7.022805017103763, + "grad_norm": 2.859375, + "learning_rate": 1.1187181663699935e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8803520053625107, + "num_tokens": 329665648.0, + "step": 3083 + }, + { + "epoch": 7.025085518814139, + "grad_norm": 3.453125, + "learning_rate": 1.1171486955165645e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.886691614985466, + "num_tokens": 329772942.0, + "step": 3084 + }, + { + "epoch": 7.027366020524515, + "grad_norm": 2.5625, + "learning_rate": 1.115580009542839e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8843899071216583, + "num_tokens": 329880278.0, + "step": 3085 + }, + { + "epoch": 7.029646522234891, + "grad_norm": 2.875, + "learning_rate": 1.1140121093391736e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8839717209339142, + "num_tokens": 329987716.0, + "step": 3086 + }, + { + "epoch": 7.031927023945268, + "grad_norm": 3.34375, + "learning_rate": 1.1124449957954764e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8822407573461533, + "num_tokens": 330094855.0, + "step": 3087 + }, + { + "epoch": 7.034207525655645, + "grad_norm": 2.484375, + "learning_rate": 1.110878669801212e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8854365050792694, + "num_tokens": 330202300.0, + "step": 3088 + }, + { + "epoch": 7.036488027366021, + "grad_norm": 2.8125, + "learning_rate": 1.1093131322453966e-06, + "loss": 0.562, + "mean_token_accuracy": 0.8828160762786865, + "num_tokens": 330309184.0, + "step": 3089 + }, + { + "epoch": 7.038768529076397, + "grad_norm": 2.96875, + "learning_rate": 1.1077483840165986e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8839673697948456, + "num_tokens": 330415860.0, + "step": 3090 + }, + { + "epoch": 7.041049030786773, + "grad_norm": 2.796875, + "learning_rate": 1.10618442600294e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8864448517560959, + "num_tokens": 330522700.0, + "step": 3091 + }, + { + "epoch": 7.043329532497149, + "grad_norm": 2.703125, + "learning_rate": 1.1046212590920931e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8861220180988312, + "num_tokens": 330630445.0, + "step": 3092 + }, + { + "epoch": 7.045610034207526, + "grad_norm": 3.25, + "learning_rate": 1.10305888417128e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8770045042037964, + "num_tokens": 330737176.0, + "step": 3093 + }, + { + "epoch": 7.047890535917902, + "grad_norm": 2.765625, + "learning_rate": 1.101497302127275e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.880455732345581, + "num_tokens": 330843523.0, + "step": 3094 + }, + { + "epoch": 7.050171037628278, + "grad_norm": 2.703125, + "learning_rate": 1.0999365138464024e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8857882469892502, + "num_tokens": 330950912.0, + "step": 3095 + }, + { + "epoch": 7.052451539338654, + "grad_norm": 3.546875, + "learning_rate": 1.0983765202145351e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8827014863491058, + "num_tokens": 331057394.0, + "step": 3096 + }, + { + "epoch": 7.05473204104903, + "grad_norm": 3.734375, + "learning_rate": 1.0968173221170966e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.88177290558815, + "num_tokens": 331164519.0, + "step": 3097 + }, + { + "epoch": 7.0570125427594075, + "grad_norm": 2.578125, + "learning_rate": 1.0952589204390557e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8831107765436172, + "num_tokens": 331272101.0, + "step": 3098 + }, + { + "epoch": 7.059293044469784, + "grad_norm": 3.46875, + "learning_rate": 1.0937013160649328e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8797809779644012, + "num_tokens": 331379251.0, + "step": 3099 + }, + { + "epoch": 7.06157354618016, + "grad_norm": 3.40625, + "learning_rate": 1.0921445098787923e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8834618926048279, + "num_tokens": 331486795.0, + "step": 3100 + }, + { + "epoch": 7.063854047890536, + "grad_norm": 3.375, + "learning_rate": 1.0905885027642484e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.882593035697937, + "num_tokens": 331593690.0, + "step": 3101 + }, + { + "epoch": 7.066134549600912, + "grad_norm": 4.21875, + "learning_rate": 1.0890332956044614e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8840242773294449, + "num_tokens": 331700378.0, + "step": 3102 + }, + { + "epoch": 7.068415051311288, + "grad_norm": 2.921875, + "learning_rate": 1.0874788892821354e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.886967346072197, + "num_tokens": 331807432.0, + "step": 3103 + }, + { + "epoch": 7.070695553021665, + "grad_norm": 2.5625, + "learning_rate": 1.0859252846795215e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.8852237462997437, + "num_tokens": 331914837.0, + "step": 3104 + }, + { + "epoch": 7.072976054732041, + "grad_norm": 3.765625, + "learning_rate": 1.0843724826784165e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8843671381473541, + "num_tokens": 332021366.0, + "step": 3105 + }, + { + "epoch": 7.075256556442417, + "grad_norm": 3.359375, + "learning_rate": 1.0828204841601608e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8801092207431793, + "num_tokens": 332128357.0, + "step": 3106 + }, + { + "epoch": 7.077537058152793, + "grad_norm": 3.1875, + "learning_rate": 1.0812692900056384e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8785477876663208, + "num_tokens": 332235371.0, + "step": 3107 + }, + { + "epoch": 7.07981755986317, + "grad_norm": 3.09375, + "learning_rate": 1.0797189010952784e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8777376860380173, + "num_tokens": 332342307.0, + "step": 3108 + }, + { + "epoch": 7.0820980615735465, + "grad_norm": 4.125, + "learning_rate": 1.0781693183090495e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8822515457868576, + "num_tokens": 332449426.0, + "step": 3109 + }, + { + "epoch": 7.084378563283923, + "grad_norm": 3.390625, + "learning_rate": 1.076620542526466e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8819447606801987, + "num_tokens": 332556668.0, + "step": 3110 + }, + { + "epoch": 7.086659064994299, + "grad_norm": 3.484375, + "learning_rate": 1.0750725746265832e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8780965954065323, + "num_tokens": 332664662.0, + "step": 3111 + }, + { + "epoch": 7.088939566704675, + "grad_norm": 2.578125, + "learning_rate": 1.0735254154879979e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.880235344171524, + "num_tokens": 332771413.0, + "step": 3112 + }, + { + "epoch": 7.091220068415051, + "grad_norm": 5.21875, + "learning_rate": 1.0719790659888481e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8757160604000092, + "num_tokens": 332878585.0, + "step": 3113 + }, + { + "epoch": 7.0935005701254275, + "grad_norm": 3.078125, + "learning_rate": 1.070433527006811e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8773680776357651, + "num_tokens": 332985770.0, + "step": 3114 + }, + { + "epoch": 7.095781071835804, + "grad_norm": 2.6875, + "learning_rate": 1.0688887994191049e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8812025189399719, + "num_tokens": 333093197.0, + "step": 3115 + }, + { + "epoch": 7.09806157354618, + "grad_norm": 3.4375, + "learning_rate": 1.0673448841024875e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8841598778963089, + "num_tokens": 333200787.0, + "step": 3116 + }, + { + "epoch": 7.100342075256556, + "grad_norm": 2.96875, + "learning_rate": 1.0658017819332556e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8818695098161697, + "num_tokens": 333308356.0, + "step": 3117 + }, + { + "epoch": 7.102622576966933, + "grad_norm": 4.8125, + "learning_rate": 1.064259493787244e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8795827180147171, + "num_tokens": 333415586.0, + "step": 3118 + }, + { + "epoch": 7.104903078677309, + "grad_norm": 3.109375, + "learning_rate": 1.0627180205398263e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8810782432556152, + "num_tokens": 333522537.0, + "step": 3119 + }, + { + "epoch": 7.1071835803876855, + "grad_norm": 2.953125, + "learning_rate": 1.0611773630659117e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8795880377292633, + "num_tokens": 333629886.0, + "step": 3120 + }, + { + "epoch": 7.109464082098062, + "grad_norm": 4.40625, + "learning_rate": 1.0596375222399491e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8833544999361038, + "num_tokens": 333737026.0, + "step": 3121 + }, + { + "epoch": 7.111744583808438, + "grad_norm": 2.875, + "learning_rate": 1.0580984989359205e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8837230205535889, + "num_tokens": 333843902.0, + "step": 3122 + }, + { + "epoch": 7.114025085518814, + "grad_norm": 2.84375, + "learning_rate": 1.0565602940273472e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8842257410287857, + "num_tokens": 333951056.0, + "step": 3123 + }, + { + "epoch": 7.11630558722919, + "grad_norm": 3.609375, + "learning_rate": 1.055022908387285e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8821539282798767, + "num_tokens": 334058094.0, + "step": 3124 + }, + { + "epoch": 7.1185860889395665, + "grad_norm": 4.21875, + "learning_rate": 1.053486342888323e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8864398002624512, + "num_tokens": 334165433.0, + "step": 3125 + }, + { + "epoch": 7.120866590649943, + "grad_norm": 3.09375, + "learning_rate": 1.0519505984025865e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8731893301010132, + "num_tokens": 334272058.0, + "step": 3126 + }, + { + "epoch": 7.123147092360319, + "grad_norm": 5.34375, + "learning_rate": 1.050415675801735e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8889250755310059, + "num_tokens": 334379380.0, + "step": 3127 + }, + { + "epoch": 7.125427594070696, + "grad_norm": 2.84375, + "learning_rate": 1.0488815759569605e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8852666765451431, + "num_tokens": 334486391.0, + "step": 3128 + }, + { + "epoch": 7.127708095781072, + "grad_norm": 4.09375, + "learning_rate": 1.0473482997389891e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8808193802833557, + "num_tokens": 334593174.0, + "step": 3129 + }, + { + "epoch": 7.129988597491448, + "grad_norm": 3.25, + "learning_rate": 1.0458158480180777e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8826655298471451, + "num_tokens": 334700093.0, + "step": 3130 + }, + { + "epoch": 7.1322690992018245, + "grad_norm": 3.140625, + "learning_rate": 1.0442842216640168e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8832122981548309, + "num_tokens": 334807273.0, + "step": 3131 + }, + { + "epoch": 7.134549600912201, + "grad_norm": 2.90625, + "learning_rate": 1.042753421546128e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.883466050028801, + "num_tokens": 334914093.0, + "step": 3132 + }, + { + "epoch": 7.136830102622577, + "grad_norm": 2.921875, + "learning_rate": 1.0412234485332636e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8840949237346649, + "num_tokens": 335021072.0, + "step": 3133 + }, + { + "epoch": 7.139110604332953, + "grad_norm": 3.046875, + "learning_rate": 1.0396943034938077e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8802383244037628, + "num_tokens": 335127732.0, + "step": 3134 + }, + { + "epoch": 7.141391106043329, + "grad_norm": 3.671875, + "learning_rate": 1.0381659872956732e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8789878785610199, + "num_tokens": 335234354.0, + "step": 3135 + }, + { + "epoch": 7.1436716077537055, + "grad_norm": 2.71875, + "learning_rate": 1.0366385008063015e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8824616819620132, + "num_tokens": 335341184.0, + "step": 3136 + }, + { + "epoch": 7.145952109464082, + "grad_norm": 2.625, + "learning_rate": 1.0351118448926658e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8839305341243744, + "num_tokens": 335448536.0, + "step": 3137 + }, + { + "epoch": 7.148232611174459, + "grad_norm": 3.0625, + "learning_rate": 1.0335860204212662e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.880255714058876, + "num_tokens": 335555399.0, + "step": 3138 + }, + { + "epoch": 7.150513112884835, + "grad_norm": 2.875, + "learning_rate": 1.0320610282581309e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8826555460691452, + "num_tokens": 335662374.0, + "step": 3139 + }, + { + "epoch": 7.152793614595211, + "grad_norm": 2.640625, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8849854469299316, + "num_tokens": 335769670.0, + "step": 3140 + }, + { + "epoch": 7.155074116305587, + "grad_norm": 3.53125, + "learning_rate": 1.029013544318407e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8815399259328842, + "num_tokens": 335876896.0, + "step": 3141 + }, + { + "epoch": 7.1573546180159635, + "grad_norm": 3.5625, + "learning_rate": 1.0274910542715103e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8821052312850952, + "num_tokens": 335983657.0, + "step": 3142 + }, + { + "epoch": 7.15963511972634, + "grad_norm": 4.625, + "learning_rate": 1.025969399992264e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8829544335603714, + "num_tokens": 336090948.0, + "step": 3143 + }, + { + "epoch": 7.161915621436716, + "grad_norm": 4.40625, + "learning_rate": 1.0244485823443281e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8804685175418854, + "num_tokens": 336197903.0, + "step": 3144 + }, + { + "epoch": 7.164196123147092, + "grad_norm": 2.421875, + "learning_rate": 1.0229286021908913e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8843429386615753, + "num_tokens": 336305488.0, + "step": 3145 + }, + { + "epoch": 7.166476624857468, + "grad_norm": 3.15625, + "learning_rate": 1.021409460394663e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8926713168621063, + "num_tokens": 336413922.0, + "step": 3146 + }, + { + "epoch": 7.168757126567845, + "grad_norm": 2.875, + "learning_rate": 1.0198911578178797e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8805201500654221, + "num_tokens": 336521353.0, + "step": 3147 + }, + { + "epoch": 7.1710376282782216, + "grad_norm": 2.859375, + "learning_rate": 1.0183736953223005e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8848622888326645, + "num_tokens": 336628098.0, + "step": 3148 + }, + { + "epoch": 7.173318129988598, + "grad_norm": 4.5, + "learning_rate": 1.0168570737692082e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8816571831703186, + "num_tokens": 336735354.0, + "step": 3149 + }, + { + "epoch": 7.175598631698974, + "grad_norm": 2.859375, + "learning_rate": 1.0153412940194073e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8790102005004883, + "num_tokens": 336843105.0, + "step": 3150 + }, + { + "epoch": 7.17787913340935, + "grad_norm": 4.09375, + "learning_rate": 1.0138263569332268e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8861614763736725, + "num_tokens": 336950109.0, + "step": 3151 + }, + { + "epoch": 7.180159635119726, + "grad_norm": 3.703125, + "learning_rate": 1.0123122633705131e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8783406764268875, + "num_tokens": 337056741.0, + "step": 3152 + }, + { + "epoch": 7.1824401368301025, + "grad_norm": 3.234375, + "learning_rate": 1.0107990141906378e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8800586462020874, + "num_tokens": 337163436.0, + "step": 3153 + }, + { + "epoch": 7.184720638540479, + "grad_norm": 3.0, + "learning_rate": 1.0092866102524922e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8837520331144333, + "num_tokens": 337270521.0, + "step": 3154 + }, + { + "epoch": 7.187001140250855, + "grad_norm": 4.34375, + "learning_rate": 1.0077750524144871e-06, + "loss": 0.56, + "mean_token_accuracy": 0.887342780828476, + "num_tokens": 337377963.0, + "step": 3155 + }, + { + "epoch": 7.189281641961231, + "grad_norm": 2.90625, + "learning_rate": 1.0062643415345546e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8867213129997253, + "num_tokens": 337484753.0, + "step": 3156 + }, + { + "epoch": 7.191562143671608, + "grad_norm": 3.21875, + "learning_rate": 1.0047544784701435e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8835946470499039, + "num_tokens": 337592359.0, + "step": 3157 + }, + { + "epoch": 7.193842645381984, + "grad_norm": 3.28125, + "learning_rate": 1.0032454640782232e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8831405937671661, + "num_tokens": 337698920.0, + "step": 3158 + }, + { + "epoch": 7.196123147092361, + "grad_norm": 2.84375, + "learning_rate": 1.0017372992152819e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.8846133053302765, + "num_tokens": 337806568.0, + "step": 3159 + }, + { + "epoch": 7.198403648802737, + "grad_norm": 2.734375, + "learning_rate": 1.0002299847373243e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.88174769282341, + "num_tokens": 337913427.0, + "step": 3160 + }, + { + "epoch": 7.200684150513113, + "grad_norm": 2.71875, + "learning_rate": 9.987235214998741e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8783168196678162, + "num_tokens": 338020074.0, + "step": 3161 + }, + { + "epoch": 7.202964652223489, + "grad_norm": 3.234375, + "learning_rate": 9.972179103579687e-07, + "loss": 0.5932, + "mean_token_accuracy": 0.8791584521532059, + "num_tokens": 338126865.0, + "step": 3162 + }, + { + "epoch": 7.205245153933865, + "grad_norm": 2.984375, + "learning_rate": 9.957131521661655e-07, + "loss": 0.561, + "mean_token_accuracy": 0.8836409598588943, + "num_tokens": 338234022.0, + "step": 3163 + }, + { + "epoch": 7.2075256556442415, + "grad_norm": 2.6875, + "learning_rate": 9.942092477785365e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8865296542644501, + "num_tokens": 338341091.0, + "step": 3164 + }, + { + "epoch": 7.209806157354618, + "grad_norm": 2.875, + "learning_rate": 9.927061980486668e-07, + "loss": 0.5629, + "mean_token_accuracy": 0.883445993065834, + "num_tokens": 338447802.0, + "step": 3165 + }, + { + "epoch": 7.212086659064994, + "grad_norm": 3.359375, + "learning_rate": 9.9120400382966e-07, + "loss": 0.5991, + "mean_token_accuracy": 0.8793905973434448, + "num_tokens": 338554661.0, + "step": 3166 + }, + { + "epoch": 7.214367160775371, + "grad_norm": 5.5625, + "learning_rate": 9.897026659741328e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8824618011713028, + "num_tokens": 338661638.0, + "step": 3167 + }, + { + "epoch": 7.216647662485747, + "grad_norm": 6.0, + "learning_rate": 9.882021853342143e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.882818415760994, + "num_tokens": 338768332.0, + "step": 3168 + }, + { + "epoch": 7.218928164196123, + "grad_norm": 3.046875, + "learning_rate": 9.867025627615493e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8801371902227402, + "num_tokens": 338875605.0, + "step": 3169 + }, + { + "epoch": 7.2212086659065, + "grad_norm": 3.046875, + "learning_rate": 9.852037991072941e-07, + "loss": 0.5702, + "mean_token_accuracy": 0.8800628185272217, + "num_tokens": 338982521.0, + "step": 3170 + }, + { + "epoch": 7.223489167616876, + "grad_norm": 4.21875, + "learning_rate": 9.837058952221182e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8799264430999756, + "num_tokens": 339089755.0, + "step": 3171 + }, + { + "epoch": 7.225769669327252, + "grad_norm": 4.59375, + "learning_rate": 9.822088519562038e-07, + "loss": 0.5651, + "mean_token_accuracy": 0.8843246251344681, + "num_tokens": 339196888.0, + "step": 3172 + }, + { + "epoch": 7.228050171037628, + "grad_norm": 2.5625, + "learning_rate": 9.80712670159242e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8821865916252136, + "num_tokens": 339303828.0, + "step": 3173 + }, + { + "epoch": 7.230330672748004, + "grad_norm": 4.3125, + "learning_rate": 9.792173506804378e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.880447655916214, + "num_tokens": 339410331.0, + "step": 3174 + }, + { + "epoch": 7.2326111744583805, + "grad_norm": 4.40625, + "learning_rate": 9.777228943685055e-07, + "loss": 0.5748, + "mean_token_accuracy": 0.8847004473209381, + "num_tokens": 339517573.0, + "step": 3175 + }, + { + "epoch": 7.234891676168757, + "grad_norm": 2.859375, + "learning_rate": 9.762293020716696e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.8809026032686234, + "num_tokens": 339624146.0, + "step": 3176 + }, + { + "epoch": 7.237172177879134, + "grad_norm": 2.75, + "learning_rate": 9.74736574637665e-07, + "loss": 0.5662, + "mean_token_accuracy": 0.8855401873588562, + "num_tokens": 339731001.0, + "step": 3177 + }, + { + "epoch": 7.23945267958951, + "grad_norm": 3.203125, + "learning_rate": 9.732447129137337e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8829121142625809, + "num_tokens": 339838319.0, + "step": 3178 + }, + { + "epoch": 7.241733181299886, + "grad_norm": 3.484375, + "learning_rate": 9.717537177466279e-07, + "loss": 0.5855, + "mean_token_accuracy": 0.8830237686634064, + "num_tokens": 339945278.0, + "step": 3179 + }, + { + "epoch": 7.244013683010262, + "grad_norm": 3.984375, + "learning_rate": 9.702635899826082e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.8862001597881317, + "num_tokens": 340052303.0, + "step": 3180 + }, + { + "epoch": 7.246294184720639, + "grad_norm": 3.734375, + "learning_rate": 9.687743304674421e-07, + "loss": 0.5634, + "mean_token_accuracy": 0.8842123001813889, + "num_tokens": 340158489.0, + "step": 3181 + }, + { + "epoch": 7.248574686431015, + "grad_norm": 3.0625, + "learning_rate": 9.672859400464046e-07, + "loss": 0.5972, + "mean_token_accuracy": 0.8775025904178619, + "num_tokens": 340265222.0, + "step": 3182 + }, + { + "epoch": 7.250855188141391, + "grad_norm": 2.921875, + "learning_rate": 9.657984195642783e-07, + "loss": 0.5997, + "mean_token_accuracy": 0.8756692260503769, + "num_tokens": 340371418.0, + "step": 3183 + }, + { + "epoch": 7.253135689851767, + "grad_norm": 2.609375, + "learning_rate": 9.64311769865349e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.8807047456502914, + "num_tokens": 340478369.0, + "step": 3184 + }, + { + "epoch": 7.255416191562143, + "grad_norm": 4.03125, + "learning_rate": 9.628259917934118e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8846138417720795, + "num_tokens": 340585428.0, + "step": 3185 + }, + { + "epoch": 7.2576966932725195, + "grad_norm": 3.0625, + "learning_rate": 9.613410861917661e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8808754086494446, + "num_tokens": 340692420.0, + "step": 3186 + }, + { + "epoch": 7.259977194982897, + "grad_norm": 3.65625, + "learning_rate": 9.59857053903214e-07, + "loss": 0.5659, + "mean_token_accuracy": 0.8816579431295395, + "num_tokens": 340799529.0, + "step": 3187 + }, + { + "epoch": 7.262257696693273, + "grad_norm": 3.609375, + "learning_rate": 9.583738957700653e-07, + "loss": 0.5858, + "mean_token_accuracy": 0.8804793953895569, + "num_tokens": 340906789.0, + "step": 3188 + }, + { + "epoch": 7.264538198403649, + "grad_norm": 2.640625, + "learning_rate": 9.568916126341305e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.8775817602872849, + "num_tokens": 341013458.0, + "step": 3189 + }, + { + "epoch": 7.266818700114025, + "grad_norm": 3.296875, + "learning_rate": 9.554102053367253e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.8787823021411896, + "num_tokens": 341120727.0, + "step": 3190 + }, + { + "epoch": 7.269099201824401, + "grad_norm": 4.6875, + "learning_rate": 9.53929674718668e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8785655200481415, + "num_tokens": 341227639.0, + "step": 3191 + }, + { + "epoch": 7.271379703534778, + "grad_norm": 3.1875, + "learning_rate": 9.524500216202795e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8779048174619675, + "num_tokens": 341334736.0, + "step": 3192 + }, + { + "epoch": 7.273660205245154, + "grad_norm": 3.671875, + "learning_rate": 9.50971246881382e-07, + "loss": 0.5946, + "mean_token_accuracy": 0.8770471662282944, + "num_tokens": 341441533.0, + "step": 3193 + }, + { + "epoch": 7.27594070695553, + "grad_norm": 3.890625, + "learning_rate": 9.494933513413007e-07, + "loss": 0.5982, + "mean_token_accuracy": 0.8785821348428726, + "num_tokens": 341547940.0, + "step": 3194 + }, + { + "epoch": 7.278221208665906, + "grad_norm": 2.890625, + "learning_rate": 9.480163358388584e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8853187263011932, + "num_tokens": 341655509.0, + "step": 3195 + }, + { + "epoch": 7.280501710376283, + "grad_norm": 2.921875, + "learning_rate": 9.465402012123818e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8840262442827225, + "num_tokens": 341762697.0, + "step": 3196 + }, + { + "epoch": 7.282782212086659, + "grad_norm": 3.296875, + "learning_rate": 9.45064948299696e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.8797417432069778, + "num_tokens": 341869635.0, + "step": 3197 + }, + { + "epoch": 7.285062713797036, + "grad_norm": 2.921875, + "learning_rate": 9.435905779381265e-07, + "loss": 0.6004, + "mean_token_accuracy": 0.8786374479532242, + "num_tokens": 341975850.0, + "step": 3198 + }, + { + "epoch": 7.287343215507412, + "grad_norm": 3.3125, + "learning_rate": 9.421170909644983e-07, + "loss": 0.5555, + "mean_token_accuracy": 0.8853399753570557, + "num_tokens": 342083686.0, + "step": 3199 + }, + { + "epoch": 7.289623717217788, + "grad_norm": 3.375, + "learning_rate": 9.406444882151322e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8812949508428574, + "num_tokens": 342191163.0, + "step": 3200 + }, + { + "epoch": 7.291904218928164, + "grad_norm": 3.09375, + "learning_rate": 9.391727705258502e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8899759203195572, + "num_tokens": 342298933.0, + "step": 3201 + }, + { + "epoch": 7.29418472063854, + "grad_norm": 5.71875, + "learning_rate": 9.377019387319705e-07, + "loss": 0.5756, + "mean_token_accuracy": 0.8839350640773773, + "num_tokens": 342405767.0, + "step": 3202 + }, + { + "epoch": 7.296465222348917, + "grad_norm": 3.0, + "learning_rate": 9.362319936683092e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.8760559111833572, + "num_tokens": 342512638.0, + "step": 3203 + }, + { + "epoch": 7.298745724059293, + "grad_norm": 3.078125, + "learning_rate": 9.347629361691795e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8815828859806061, + "num_tokens": 342619888.0, + "step": 3204 + }, + { + "epoch": 7.301026225769669, + "grad_norm": 3.953125, + "learning_rate": 9.332947670683882e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8862589299678802, + "num_tokens": 342726859.0, + "step": 3205 + }, + { + "epoch": 7.303306727480045, + "grad_norm": 2.484375, + "learning_rate": 9.318274871992408e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8866942375898361, + "num_tokens": 342834871.0, + "step": 3206 + }, + { + "epoch": 7.305587229190422, + "grad_norm": 2.953125, + "learning_rate": 9.303610973945376e-07, + "loss": 0.5552, + "mean_token_accuracy": 0.8848265260457993, + "num_tokens": 342942485.0, + "step": 3207 + }, + { + "epoch": 7.307867730900798, + "grad_norm": 3.28125, + "learning_rate": 9.288955984865717e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8879365026950836, + "num_tokens": 343049386.0, + "step": 3208 + }, + { + "epoch": 7.310148232611175, + "grad_norm": 4.1875, + "learning_rate": 9.274309913071328e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.8782593309879303, + "num_tokens": 343155935.0, + "step": 3209 + }, + { + "epoch": 7.312428734321551, + "grad_norm": 3.640625, + "learning_rate": 9.259672766875044e-07, + "loss": 0.5835, + "mean_token_accuracy": 0.8784520477056503, + "num_tokens": 343262869.0, + "step": 3210 + }, + { + "epoch": 7.314709236031927, + "grad_norm": 2.640625, + "learning_rate": 9.245044554584609e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.884000301361084, + "num_tokens": 343369889.0, + "step": 3211 + }, + { + "epoch": 7.316989737742303, + "grad_norm": 3.640625, + "learning_rate": 9.230425284502725e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.8830729424953461, + "num_tokens": 343476997.0, + "step": 3212 + }, + { + "epoch": 7.319270239452679, + "grad_norm": 3.8125, + "learning_rate": 9.215814964927005e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8840525597333908, + "num_tokens": 343584267.0, + "step": 3213 + }, + { + "epoch": 7.321550741163056, + "grad_norm": 3.203125, + "learning_rate": 9.201213604149989e-07, + "loss": 0.5861, + "mean_token_accuracy": 0.8782940655946732, + "num_tokens": 343690596.0, + "step": 3214 + }, + { + "epoch": 7.323831242873432, + "grad_norm": 3.03125, + "learning_rate": 9.186621210459129e-07, + "loss": 0.6009, + "mean_token_accuracy": 0.877897322177887, + "num_tokens": 343797695.0, + "step": 3215 + }, + { + "epoch": 7.326111744583809, + "grad_norm": 2.46875, + "learning_rate": 9.172037792136773e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8841624855995178, + "num_tokens": 343905377.0, + "step": 3216 + }, + { + "epoch": 7.328392246294185, + "grad_norm": 3.21875, + "learning_rate": 9.157463357460194e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8875628858804703, + "num_tokens": 344012519.0, + "step": 3217 + }, + { + "epoch": 7.330672748004561, + "grad_norm": 3.421875, + "learning_rate": 9.142897914701565e-07, + "loss": 0.5795, + "mean_token_accuracy": 0.8802744299173355, + "num_tokens": 344120062.0, + "step": 3218 + }, + { + "epoch": 7.3329532497149374, + "grad_norm": 2.59375, + "learning_rate": 9.128341472127944e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8852846622467041, + "num_tokens": 344227315.0, + "step": 3219 + }, + { + "epoch": 7.335233751425314, + "grad_norm": 2.5, + "learning_rate": 9.113794038001298e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8799241334199905, + "num_tokens": 344334679.0, + "step": 3220 + }, + { + "epoch": 7.33751425313569, + "grad_norm": 3.1875, + "learning_rate": 9.099255620578451e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8857553601264954, + "num_tokens": 344441934.0, + "step": 3221 + }, + { + "epoch": 7.339794754846066, + "grad_norm": 2.796875, + "learning_rate": 9.084726228111141e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8808804005384445, + "num_tokens": 344548474.0, + "step": 3222 + }, + { + "epoch": 7.342075256556442, + "grad_norm": 4.1875, + "learning_rate": 9.070205868845966e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8860443085432053, + "num_tokens": 344655190.0, + "step": 3223 + }, + { + "epoch": 7.344355758266818, + "grad_norm": 5.09375, + "learning_rate": 9.055694551024402e-07, + "loss": 0.5654, + "mean_token_accuracy": 0.8791934102773666, + "num_tokens": 344762745.0, + "step": 3224 + }, + { + "epoch": 7.346636259977195, + "grad_norm": 2.625, + "learning_rate": 9.041192282882796e-07, + "loss": 0.5971, + "mean_token_accuracy": 0.8765173703432083, + "num_tokens": 344869812.0, + "step": 3225 + }, + { + "epoch": 7.348916761687571, + "grad_norm": 2.890625, + "learning_rate": 9.026699072652361e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8783349841833115, + "num_tokens": 344976836.0, + "step": 3226 + }, + { + "epoch": 7.351197263397948, + "grad_norm": 3.5625, + "learning_rate": 9.012214928559149e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8875472396612167, + "num_tokens": 345084950.0, + "step": 3227 + }, + { + "epoch": 7.353477765108324, + "grad_norm": 6.34375, + "learning_rate": 8.997739858824083e-07, + "loss": 0.583, + "mean_token_accuracy": 0.8791827261447906, + "num_tokens": 345191976.0, + "step": 3228 + }, + { + "epoch": 7.3557582668187, + "grad_norm": 2.8125, + "learning_rate": 8.983273871662951e-07, + "loss": 0.587, + "mean_token_accuracy": 0.8798834979534149, + "num_tokens": 345299539.0, + "step": 3229 + }, + { + "epoch": 7.3580387685290765, + "grad_norm": 2.71875, + "learning_rate": 8.968816975286346e-07, + "loss": 0.5847, + "mean_token_accuracy": 0.8791932165622711, + "num_tokens": 345406386.0, + "step": 3230 + }, + { + "epoch": 7.360319270239453, + "grad_norm": 3.640625, + "learning_rate": 8.954369177899727e-07, + "loss": 0.5763, + "mean_token_accuracy": 0.8776541501283646, + "num_tokens": 345513832.0, + "step": 3231 + }, + { + "epoch": 7.362599771949829, + "grad_norm": 3.6875, + "learning_rate": 8.939930487703402e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8819593489170074, + "num_tokens": 345621238.0, + "step": 3232 + }, + { + "epoch": 7.364880273660205, + "grad_norm": 4.75, + "learning_rate": 8.925500912892471e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.878967210650444, + "num_tokens": 345727718.0, + "step": 3233 + }, + { + "epoch": 7.367160775370581, + "grad_norm": 2.84375, + "learning_rate": 8.911080461656893e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.885176420211792, + "num_tokens": 345834217.0, + "step": 3234 + }, + { + "epoch": 7.369441277080957, + "grad_norm": 3.4375, + "learning_rate": 8.896669142181436e-07, + "loss": 0.5766, + "mean_token_accuracy": 0.8792836219072342, + "num_tokens": 345941843.0, + "step": 3235 + }, + { + "epoch": 7.3717217787913345, + "grad_norm": 3.921875, + "learning_rate": 8.882266962645695e-07, + "loss": 0.5824, + "mean_token_accuracy": 0.8821652680635452, + "num_tokens": 346049168.0, + "step": 3236 + }, + { + "epoch": 7.374002280501711, + "grad_norm": 3.34375, + "learning_rate": 8.867873931224053e-07, + "loss": 0.5566, + "mean_token_accuracy": 0.8830807954072952, + "num_tokens": 346156521.0, + "step": 3237 + }, + { + "epoch": 7.376282782212087, + "grad_norm": 3.46875, + "learning_rate": 8.853490056085723e-07, + "loss": 0.5724, + "mean_token_accuracy": 0.8781331777572632, + "num_tokens": 346263445.0, + "step": 3238 + }, + { + "epoch": 7.378563283922463, + "grad_norm": 2.625, + "learning_rate": 8.839115345394716e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8834449648857117, + "num_tokens": 346370432.0, + "step": 3239 + }, + { + "epoch": 7.380843785632839, + "grad_norm": 3.0625, + "learning_rate": 8.824749807309846e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8865119814872742, + "num_tokens": 346477909.0, + "step": 3240 + }, + { + "epoch": 7.3831242873432155, + "grad_norm": 3.578125, + "learning_rate": 8.810393449984706e-07, + "loss": 0.585, + "mean_token_accuracy": 0.8794442266225815, + "num_tokens": 346584653.0, + "step": 3241 + }, + { + "epoch": 7.385404789053592, + "grad_norm": 2.890625, + "learning_rate": 8.7960462815677e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8795085549354553, + "num_tokens": 346691178.0, + "step": 3242 + }, + { + "epoch": 7.387685290763968, + "grad_norm": 3.921875, + "learning_rate": 8.781708310201989e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8901244848966599, + "num_tokens": 346798766.0, + "step": 3243 + }, + { + "epoch": 7.389965792474344, + "grad_norm": 3.75, + "learning_rate": 8.767379544025531e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8783616870641708, + "num_tokens": 346905556.0, + "step": 3244 + }, + { + "epoch": 7.39224629418472, + "grad_norm": 3.078125, + "learning_rate": 8.753059991171065e-07, + "loss": 0.6058, + "mean_token_accuracy": 0.8745017796754837, + "num_tokens": 347012108.0, + "step": 3245 + }, + { + "epoch": 7.394526795895097, + "grad_norm": 3.078125, + "learning_rate": 8.738749659766085e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.8834935277700424, + "num_tokens": 347119108.0, + "step": 3246 + }, + { + "epoch": 7.3968072976054735, + "grad_norm": 2.515625, + "learning_rate": 8.724448557932874e-07, + "loss": 0.5824, + "mean_token_accuracy": 0.8808663189411163, + "num_tokens": 347226523.0, + "step": 3247 + }, + { + "epoch": 7.39908779931585, + "grad_norm": 3.9375, + "learning_rate": 8.71015669378844e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.883145734667778, + "num_tokens": 347333636.0, + "step": 3248 + }, + { + "epoch": 7.401368301026226, + "grad_norm": 3.046875, + "learning_rate": 8.69587407544458e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8786642998456955, + "num_tokens": 347440255.0, + "step": 3249 + }, + { + "epoch": 7.403648802736602, + "grad_norm": 3.3125, + "learning_rate": 8.681600711007832e-07, + "loss": 0.5679, + "mean_token_accuracy": 0.8843037188053131, + "num_tokens": 347546867.0, + "step": 3250 + }, + { + "epoch": 7.405929304446978, + "grad_norm": 2.75, + "learning_rate": 8.667336608579488e-07, + "loss": 0.5645, + "mean_token_accuracy": 0.8831322491168976, + "num_tokens": 347653956.0, + "step": 3251 + }, + { + "epoch": 7.4082098061573545, + "grad_norm": 2.546875, + "learning_rate": 8.653081776255562e-07, + "loss": 0.557, + "mean_token_accuracy": 0.8858849257230759, + "num_tokens": 347761532.0, + "step": 3252 + }, + { + "epoch": 7.410490307867731, + "grad_norm": 3.796875, + "learning_rate": 8.638836222126839e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8869747668504715, + "num_tokens": 347869055.0, + "step": 3253 + }, + { + "epoch": 7.412770809578107, + "grad_norm": 5.0625, + "learning_rate": 8.624599954278803e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.8779660612344742, + "num_tokens": 347975665.0, + "step": 3254 + }, + { + "epoch": 7.415051311288483, + "grad_norm": 4.0625, + "learning_rate": 8.610372980791695e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8830466419458389, + "num_tokens": 348082812.0, + "step": 3255 + }, + { + "epoch": 7.41733181299886, + "grad_norm": 6.40625, + "learning_rate": 8.59615530974047e-07, + "loss": 0.5588, + "mean_token_accuracy": 0.8859402984380722, + "num_tokens": 348190346.0, + "step": 3256 + }, + { + "epoch": 7.419612314709236, + "grad_norm": 2.515625, + "learning_rate": 8.581946949194802e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.881903350353241, + "num_tokens": 348297177.0, + "step": 3257 + }, + { + "epoch": 7.4218928164196125, + "grad_norm": 2.59375, + "learning_rate": 8.56774790721909e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8854805678129196, + "num_tokens": 348404580.0, + "step": 3258 + }, + { + "epoch": 7.424173318129989, + "grad_norm": 4.03125, + "learning_rate": 8.553558191872422e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.8778659850358963, + "num_tokens": 348511494.0, + "step": 3259 + }, + { + "epoch": 7.426453819840365, + "grad_norm": 3.765625, + "learning_rate": 8.539377811208613e-07, + "loss": 0.5819, + "mean_token_accuracy": 0.8794593662023544, + "num_tokens": 348618636.0, + "step": 3260 + }, + { + "epoch": 7.428734321550741, + "grad_norm": 4.15625, + "learning_rate": 8.525206773276173e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.883561447262764, + "num_tokens": 348725346.0, + "step": 3261 + }, + { + "epoch": 7.431014823261117, + "grad_norm": 2.96875, + "learning_rate": 8.511045086118311e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.8908537030220032, + "num_tokens": 348832898.0, + "step": 3262 + }, + { + "epoch": 7.4332953249714935, + "grad_norm": 2.875, + "learning_rate": 8.496892757772934e-07, + "loss": 0.5582, + "mean_token_accuracy": 0.8817539364099503, + "num_tokens": 348940485.0, + "step": 3263 + }, + { + "epoch": 7.43557582668187, + "grad_norm": 2.703125, + "learning_rate": 8.482749796272613e-07, + "loss": 0.586, + "mean_token_accuracy": 0.8792066723108292, + "num_tokens": 349047156.0, + "step": 3264 + }, + { + "epoch": 7.437856328392247, + "grad_norm": 2.609375, + "learning_rate": 8.468616209644634e-07, + "loss": 0.5426, + "mean_token_accuracy": 0.8881279230117798, + "num_tokens": 349153893.0, + "step": 3265 + }, + { + "epoch": 7.440136830102623, + "grad_norm": 2.78125, + "learning_rate": 8.454492005910942e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8840852081775665, + "num_tokens": 349260811.0, + "step": 3266 + }, + { + "epoch": 7.442417331812999, + "grad_norm": 2.921875, + "learning_rate": 8.440377193088162e-07, + "loss": 0.5661, + "mean_token_accuracy": 0.8842138350009918, + "num_tokens": 349368153.0, + "step": 3267 + }, + { + "epoch": 7.444697833523375, + "grad_norm": 3.921875, + "learning_rate": 8.426271779187592e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8814673125743866, + "num_tokens": 349475704.0, + "step": 3268 + }, + { + "epoch": 7.4469783352337515, + "grad_norm": 2.703125, + "learning_rate": 8.4121757722152e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8854426443576813, + "num_tokens": 349582741.0, + "step": 3269 + }, + { + "epoch": 7.449258836944128, + "grad_norm": 2.90625, + "learning_rate": 8.398089180171592e-07, + "loss": 0.5512, + "mean_token_accuracy": 0.8872993588447571, + "num_tokens": 349689828.0, + "step": 3270 + }, + { + "epoch": 7.451539338654504, + "grad_norm": 2.640625, + "learning_rate": 8.384012011052053e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8934241831302643, + "num_tokens": 349797473.0, + "step": 3271 + }, + { + "epoch": 7.45381984036488, + "grad_norm": 4.6875, + "learning_rate": 8.369944272846522e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8873519897460938, + "num_tokens": 349904634.0, + "step": 3272 + }, + { + "epoch": 7.456100342075256, + "grad_norm": 3.109375, + "learning_rate": 8.355885973539557e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8826733976602554, + "num_tokens": 350011607.0, + "step": 3273 + }, + { + "epoch": 7.4583808437856325, + "grad_norm": 3.0, + "learning_rate": 8.341837121110386e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8846445381641388, + "num_tokens": 350119034.0, + "step": 3274 + }, + { + "epoch": 7.460661345496009, + "grad_norm": 2.5625, + "learning_rate": 8.327797723532874e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.8912030011415482, + "num_tokens": 350226528.0, + "step": 3275 + }, + { + "epoch": 7.462941847206386, + "grad_norm": 2.75, + "learning_rate": 8.313767788775498e-07, + "loss": 0.596, + "mean_token_accuracy": 0.8776167631149292, + "num_tokens": 350333295.0, + "step": 3276 + }, + { + "epoch": 7.465222348916762, + "grad_norm": 2.609375, + "learning_rate": 8.299747324801385e-07, + "loss": 0.5985, + "mean_token_accuracy": 0.8792443871498108, + "num_tokens": 350440043.0, + "step": 3277 + }, + { + "epoch": 7.467502850627138, + "grad_norm": 3.078125, + "learning_rate": 8.285736339568279e-07, + "loss": 0.5906, + "mean_token_accuracy": 0.8789392858743668, + "num_tokens": 350546875.0, + "step": 3278 + }, + { + "epoch": 7.469783352337514, + "grad_norm": 2.765625, + "learning_rate": 8.271734841028553e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8837863206863403, + "num_tokens": 350653887.0, + "step": 3279 + }, + { + "epoch": 7.4720638540478905, + "grad_norm": 2.625, + "learning_rate": 8.25774283712917e-07, + "loss": 0.5674, + "mean_token_accuracy": 0.8839046061038971, + "num_tokens": 350761695.0, + "step": 3280 + }, + { + "epoch": 7.474344355758267, + "grad_norm": 3.515625, + "learning_rate": 8.243760335811734e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8785386085510254, + "num_tokens": 350868270.0, + "step": 3281 + }, + { + "epoch": 7.476624857468643, + "grad_norm": 2.90625, + "learning_rate": 8.229787345012439e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.8850255161523819, + "num_tokens": 350975311.0, + "step": 3282 + }, + { + "epoch": 7.478905359179019, + "grad_norm": 3.28125, + "learning_rate": 8.215823872662084e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8832758516073227, + "num_tokens": 351082788.0, + "step": 3283 + }, + { + "epoch": 7.481185860889395, + "grad_norm": 2.921875, + "learning_rate": 8.201869926686068e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8810236304998398, + "num_tokens": 351190001.0, + "step": 3284 + }, + { + "epoch": 7.483466362599772, + "grad_norm": 4.28125, + "learning_rate": 8.187925515004391e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8823070824146271, + "num_tokens": 351296897.0, + "step": 3285 + }, + { + "epoch": 7.485746864310149, + "grad_norm": 4.3125, + "learning_rate": 8.173990645531612e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8813482820987701, + "num_tokens": 351404175.0, + "step": 3286 + }, + { + "epoch": 7.488027366020525, + "grad_norm": 2.9375, + "learning_rate": 8.160065326176905e-07, + "loss": 0.5748, + "mean_token_accuracy": 0.8828651309013367, + "num_tokens": 351510970.0, + "step": 3287 + }, + { + "epoch": 7.490307867730901, + "grad_norm": 3.234375, + "learning_rate": 8.14614956484401e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8815842270851135, + "num_tokens": 351617878.0, + "step": 3288 + }, + { + "epoch": 7.492588369441277, + "grad_norm": 3.5625, + "learning_rate": 8.132243369431248e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8866011053323746, + "num_tokens": 351725881.0, + "step": 3289 + }, + { + "epoch": 7.494868871151653, + "grad_norm": 3.265625, + "learning_rate": 8.11834674783151e-07, + "loss": 0.5889, + "mean_token_accuracy": 0.8792490214109421, + "num_tokens": 351832802.0, + "step": 3290 + }, + { + "epoch": 7.4971493728620295, + "grad_norm": 2.75, + "learning_rate": 8.104459707932238e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8836107850074768, + "num_tokens": 351939776.0, + "step": 3291 + }, + { + "epoch": 7.499429874572406, + "grad_norm": 3.296875, + "learning_rate": 8.090582257615456e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8809670358896255, + "num_tokens": 352046740.0, + "step": 3292 + }, + { + "epoch": 7.501710376282782, + "grad_norm": 2.546875, + "learning_rate": 8.076714404757735e-07, + "loss": 0.5593, + "mean_token_accuracy": 0.8861764520406723, + "num_tokens": 352153749.0, + "step": 3293 + }, + { + "epoch": 7.503990877993158, + "grad_norm": 3.03125, + "learning_rate": 8.062856157230209e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8781421184539795, + "num_tokens": 352260556.0, + "step": 3294 + }, + { + "epoch": 7.506271379703534, + "grad_norm": 3.921875, + "learning_rate": 8.049007522898536e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8831081688404083, + "num_tokens": 352367485.0, + "step": 3295 + }, + { + "epoch": 7.508551881413911, + "grad_norm": 4.46875, + "learning_rate": 8.035168509622948e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8832761347293854, + "num_tokens": 352473992.0, + "step": 3296 + }, + { + "epoch": 7.510832383124288, + "grad_norm": 3.375, + "learning_rate": 8.02133912525819e-07, + "loss": 0.5534, + "mean_token_accuracy": 0.8868482112884521, + "num_tokens": 352582216.0, + "step": 3297 + }, + { + "epoch": 7.513112884834664, + "grad_norm": 3.15625, + "learning_rate": 8.007519377653558e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8818371295928955, + "num_tokens": 352688608.0, + "step": 3298 + }, + { + "epoch": 7.51539338654504, + "grad_norm": 2.640625, + "learning_rate": 7.993709274652872e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8833436369895935, + "num_tokens": 352794995.0, + "step": 3299 + }, + { + "epoch": 7.517673888255416, + "grad_norm": 2.625, + "learning_rate": 7.979908824094484e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8875368237495422, + "num_tokens": 352902959.0, + "step": 3300 + }, + { + "epoch": 7.517673888255416, + "eval_loss": 0.5862932801246643, + "eval_mean_token_accuracy": 0.8799256974753318, + "eval_num_tokens": 352902959.0, + "eval_runtime": 58.6491, + "eval_samples_per_second": 142.969, + "eval_steps_per_second": 4.484, + "step": 3300 + }, + { + "epoch": 7.519954389965792, + "grad_norm": 3.734375, + "learning_rate": 7.966118033811271e-07, + "loss": 0.5748, + "mean_token_accuracy": 0.8830474466085434, + "num_tokens": 353010544.0, + "step": 3301 + }, + { + "epoch": 7.5222348916761685, + "grad_norm": 3.21875, + "learning_rate": 7.952336911630604e-07, + "loss": 0.5655, + "mean_token_accuracy": 0.8814462274312973, + "num_tokens": 353117940.0, + "step": 3302 + }, + { + "epoch": 7.524515393386545, + "grad_norm": 3.359375, + "learning_rate": 7.938565465374384e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8806511014699936, + "num_tokens": 353225388.0, + "step": 3303 + }, + { + "epoch": 7.526795895096921, + "grad_norm": 5.21875, + "learning_rate": 7.924803702859024e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8838271647691727, + "num_tokens": 353332497.0, + "step": 3304 + }, + { + "epoch": 7.529076396807298, + "grad_norm": 2.734375, + "learning_rate": 7.911051631895433e-07, + "loss": 0.5663, + "mean_token_accuracy": 0.8843842297792435, + "num_tokens": 353439584.0, + "step": 3305 + }, + { + "epoch": 7.531356898517674, + "grad_norm": 3.296875, + "learning_rate": 7.897309260289027e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.882561057806015, + "num_tokens": 353546370.0, + "step": 3306 + }, + { + "epoch": 7.53363740022805, + "grad_norm": 3.296875, + "learning_rate": 7.883576595839698e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.879010796546936, + "num_tokens": 353653493.0, + "step": 3307 + }, + { + "epoch": 7.535917901938427, + "grad_norm": 3.09375, + "learning_rate": 7.869853646341849e-07, + "loss": 0.581, + "mean_token_accuracy": 0.8826321810483932, + "num_tokens": 353759627.0, + "step": 3308 + }, + { + "epoch": 7.538198403648803, + "grad_norm": 3.796875, + "learning_rate": 7.856140419584357e-07, + "loss": 0.5969, + "mean_token_accuracy": 0.8757929354906082, + "num_tokens": 353866548.0, + "step": 3309 + }, + { + "epoch": 7.540478905359179, + "grad_norm": 2.828125, + "learning_rate": 7.842436923350591e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8822405785322189, + "num_tokens": 353973896.0, + "step": 3310 + }, + { + "epoch": 7.542759407069555, + "grad_norm": 4.25, + "learning_rate": 7.828743165418393e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.8774521350860596, + "num_tokens": 354080599.0, + "step": 3311 + }, + { + "epoch": 7.545039908779931, + "grad_norm": 2.90625, + "learning_rate": 7.815059153560065e-07, + "loss": 0.5634, + "mean_token_accuracy": 0.8832540661096573, + "num_tokens": 354188075.0, + "step": 3312 + }, + { + "epoch": 7.5473204104903076, + "grad_norm": 4.5, + "learning_rate": 7.801384895542391e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8853226155042648, + "num_tokens": 354295337.0, + "step": 3313 + }, + { + "epoch": 7.549600912200685, + "grad_norm": 2.65625, + "learning_rate": 7.78772039912662e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8821288347244263, + "num_tokens": 354402522.0, + "step": 3314 + }, + { + "epoch": 7.55188141391106, + "grad_norm": 2.921875, + "learning_rate": 7.774065672068463e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8814186155796051, + "num_tokens": 354509530.0, + "step": 3315 + }, + { + "epoch": 7.554161915621437, + "grad_norm": 4.1875, + "learning_rate": 7.760420722118059e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8813648819923401, + "num_tokens": 354616615.0, + "step": 3316 + }, + { + "epoch": 7.556442417331813, + "grad_norm": 3.53125, + "learning_rate": 7.746785557020034e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8848537355661392, + "num_tokens": 354724411.0, + "step": 3317 + }, + { + "epoch": 7.558722919042189, + "grad_norm": 4.5, + "learning_rate": 7.733160184513447e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.8836092352867126, + "num_tokens": 354831136.0, + "step": 3318 + }, + { + "epoch": 7.561003420752566, + "grad_norm": 4.25, + "learning_rate": 7.719544612331781e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8814618289470673, + "num_tokens": 354938130.0, + "step": 3319 + }, + { + "epoch": 7.563283922462942, + "grad_norm": 2.640625, + "learning_rate": 7.705938848202985e-07, + "loss": 0.585, + "mean_token_accuracy": 0.8781514316797256, + "num_tokens": 355044678.0, + "step": 3320 + }, + { + "epoch": 7.565564424173318, + "grad_norm": 2.65625, + "learning_rate": 7.692342899849419e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.883265346288681, + "num_tokens": 355152110.0, + "step": 3321 + }, + { + "epoch": 7.567844925883694, + "grad_norm": 3.03125, + "learning_rate": 7.678756774987897e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8851431161165237, + "num_tokens": 355259414.0, + "step": 3322 + }, + { + "epoch": 7.57012542759407, + "grad_norm": 2.953125, + "learning_rate": 7.665180481329621e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8860851675271988, + "num_tokens": 355366168.0, + "step": 3323 + }, + { + "epoch": 7.572405929304447, + "grad_norm": 3.09375, + "learning_rate": 7.651614026580243e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8835692703723907, + "num_tokens": 355472724.0, + "step": 3324 + }, + { + "epoch": 7.574686431014824, + "grad_norm": 2.703125, + "learning_rate": 7.638057418439818e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8791484832763672, + "num_tokens": 355579877.0, + "step": 3325 + }, + { + "epoch": 7.5769669327252, + "grad_norm": 2.671875, + "learning_rate": 7.624510664602819e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8831581175327301, + "num_tokens": 355686885.0, + "step": 3326 + }, + { + "epoch": 7.579247434435576, + "grad_norm": 3.625, + "learning_rate": 7.610973772758118e-07, + "loss": 0.5867, + "mean_token_accuracy": 0.8818655163049698, + "num_tokens": 355793461.0, + "step": 3327 + }, + { + "epoch": 7.581527936145952, + "grad_norm": 3.0, + "learning_rate": 7.597446750589005e-07, + "loss": 0.579, + "mean_token_accuracy": 0.8854977488517761, + "num_tokens": 355900140.0, + "step": 3328 + }, + { + "epoch": 7.583808437856328, + "grad_norm": 3.4375, + "learning_rate": 7.583929605773138e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.8835340738296509, + "num_tokens": 356007033.0, + "step": 3329 + }, + { + "epoch": 7.586088939566705, + "grad_norm": 2.53125, + "learning_rate": 7.570422345982598e-07, + "loss": 0.558, + "mean_token_accuracy": 0.8873008489608765, + "num_tokens": 356114209.0, + "step": 3330 + }, + { + "epoch": 7.588369441277081, + "grad_norm": 3.75, + "learning_rate": 7.556924978883843e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8829249292612076, + "num_tokens": 356220979.0, + "step": 3331 + }, + { + "epoch": 7.590649942987457, + "grad_norm": 3.8125, + "learning_rate": 7.543437512137717e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8828289657831192, + "num_tokens": 356327873.0, + "step": 3332 + }, + { + "epoch": 7.592930444697833, + "grad_norm": 3.15625, + "learning_rate": 7.529959953399455e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8852836340665817, + "num_tokens": 356435610.0, + "step": 3333 + }, + { + "epoch": 7.59521094640821, + "grad_norm": 3.21875, + "learning_rate": 7.516492310318643e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.8795439153909683, + "num_tokens": 356542182.0, + "step": 3334 + }, + { + "epoch": 7.5974914481185865, + "grad_norm": 2.703125, + "learning_rate": 7.503034590539266e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.88029345870018, + "num_tokens": 356648911.0, + "step": 3335 + }, + { + "epoch": 7.599771949828963, + "grad_norm": 3.65625, + "learning_rate": 7.489586801699661e-07, + "loss": 0.5624, + "mean_token_accuracy": 0.8867314010858536, + "num_tokens": 356755662.0, + "step": 3336 + }, + { + "epoch": 7.602052451539339, + "grad_norm": 2.265625, + "learning_rate": 7.476148951432543e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8867203295230865, + "num_tokens": 356862489.0, + "step": 3337 + }, + { + "epoch": 7.604332953249715, + "grad_norm": 3.375, + "learning_rate": 7.462721047364965e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8798790127038956, + "num_tokens": 356969949.0, + "step": 3338 + }, + { + "epoch": 7.606613454960091, + "grad_norm": 3.671875, + "learning_rate": 7.449303097118355e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8800079971551895, + "num_tokens": 357077056.0, + "step": 3339 + }, + { + "epoch": 7.608893956670467, + "grad_norm": 2.78125, + "learning_rate": 7.435895108308472e-07, + "loss": 0.5725, + "mean_token_accuracy": 0.8835026770830154, + "num_tokens": 357184699.0, + "step": 3340 + }, + { + "epoch": 7.611174458380844, + "grad_norm": 2.90625, + "learning_rate": 7.422497088545436e-07, + "loss": 0.5782, + "mean_token_accuracy": 0.8809027224779129, + "num_tokens": 357291871.0, + "step": 3341 + }, + { + "epoch": 7.61345496009122, + "grad_norm": 2.578125, + "learning_rate": 7.409109045433704e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.8787789940834045, + "num_tokens": 357399018.0, + "step": 3342 + }, + { + "epoch": 7.615735461801596, + "grad_norm": 3.15625, + "learning_rate": 7.395730986572075e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.8798029869794846, + "num_tokens": 357505735.0, + "step": 3343 + }, + { + "epoch": 7.618015963511972, + "grad_norm": 2.546875, + "learning_rate": 7.382362919553682e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8806295096874237, + "num_tokens": 357613067.0, + "step": 3344 + }, + { + "epoch": 7.620296465222349, + "grad_norm": 3.296875, + "learning_rate": 7.369004851965966e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8788366466760635, + "num_tokens": 357720663.0, + "step": 3345 + }, + { + "epoch": 7.6225769669327255, + "grad_norm": 2.921875, + "learning_rate": 7.355656791390717e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8791816085577011, + "num_tokens": 357828239.0, + "step": 3346 + }, + { + "epoch": 7.624857468643102, + "grad_norm": 4.09375, + "learning_rate": 7.342318745404034e-07, + "loss": 0.573, + "mean_token_accuracy": 0.8827292621135712, + "num_tokens": 357935648.0, + "step": 3347 + }, + { + "epoch": 7.627137970353478, + "grad_norm": 3.046875, + "learning_rate": 7.32899072157634e-07, + "loss": 0.5635, + "mean_token_accuracy": 0.8853467255830765, + "num_tokens": 358042684.0, + "step": 3348 + }, + { + "epoch": 7.629418472063854, + "grad_norm": 4.96875, + "learning_rate": 7.315672727472365e-07, + "loss": 0.5439, + "mean_token_accuracy": 0.8852219432592392, + "num_tokens": 358150191.0, + "step": 3349 + }, + { + "epoch": 7.63169897377423, + "grad_norm": 3.125, + "learning_rate": 7.302364770651132e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.888086274266243, + "num_tokens": 358257157.0, + "step": 3350 + }, + { + "epoch": 7.633979475484606, + "grad_norm": 3.28125, + "learning_rate": 7.289066858665991e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8860320299863815, + "num_tokens": 358364536.0, + "step": 3351 + }, + { + "epoch": 7.636259977194983, + "grad_norm": 2.734375, + "learning_rate": 7.275778999064578e-07, + "loss": 0.5927, + "mean_token_accuracy": 0.8757172375917435, + "num_tokens": 358471817.0, + "step": 3352 + }, + { + "epoch": 7.638540478905359, + "grad_norm": 2.59375, + "learning_rate": 7.262501199388827e-07, + "loss": 0.5552, + "mean_token_accuracy": 0.8869472742080688, + "num_tokens": 358578963.0, + "step": 3353 + }, + { + "epoch": 7.640820980615736, + "grad_norm": 2.546875, + "learning_rate": 7.249233467174965e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8762264847755432, + "num_tokens": 358685361.0, + "step": 3354 + }, + { + "epoch": 7.643101482326112, + "grad_norm": 4.40625, + "learning_rate": 7.235975809953491e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8874974101781845, + "num_tokens": 358792677.0, + "step": 3355 + }, + { + "epoch": 7.645381984036488, + "grad_norm": 4.125, + "learning_rate": 7.222728235249196e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8794031143188477, + "num_tokens": 358899865.0, + "step": 3356 + }, + { + "epoch": 7.6476624857468645, + "grad_norm": 2.828125, + "learning_rate": 7.209490750581152e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8865524530410767, + "num_tokens": 359006856.0, + "step": 3357 + }, + { + "epoch": 7.649942987457241, + "grad_norm": 2.921875, + "learning_rate": 7.196263363462699e-07, + "loss": 0.575, + "mean_token_accuracy": 0.88320592045784, + "num_tokens": 359113787.0, + "step": 3358 + }, + { + "epoch": 7.652223489167617, + "grad_norm": 2.796875, + "learning_rate": 7.183046081401454e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8821183294057846, + "num_tokens": 359220641.0, + "step": 3359 + }, + { + "epoch": 7.654503990877993, + "grad_norm": 3.328125, + "learning_rate": 7.169838911899276e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8826924860477448, + "num_tokens": 359327949.0, + "step": 3360 + }, + { + "epoch": 7.656784492588369, + "grad_norm": 3.640625, + "learning_rate": 7.156641862452316e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.8792467415332794, + "num_tokens": 359434539.0, + "step": 3361 + }, + { + "epoch": 7.659064994298745, + "grad_norm": 5.59375, + "learning_rate": 7.143454940550948e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8802933245897293, + "num_tokens": 359541747.0, + "step": 3362 + }, + { + "epoch": 7.661345496009122, + "grad_norm": 2.953125, + "learning_rate": 7.13027815367982e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8832086026668549, + "num_tokens": 359648549.0, + "step": 3363 + }, + { + "epoch": 7.663625997719498, + "grad_norm": 4.5, + "learning_rate": 7.117111509317823e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8798936158418655, + "num_tokens": 359755583.0, + "step": 3364 + }, + { + "epoch": 7.665906499429875, + "grad_norm": 5.09375, + "learning_rate": 7.103955014938099e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.885413408279419, + "num_tokens": 359862616.0, + "step": 3365 + }, + { + "epoch": 7.668187001140251, + "grad_norm": 3.125, + "learning_rate": 7.090808678008005e-07, + "loss": 0.5825, + "mean_token_accuracy": 0.8743928670883179, + "num_tokens": 359969068.0, + "step": 3366 + }, + { + "epoch": 7.670467502850627, + "grad_norm": 4.375, + "learning_rate": 7.077672505989155e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8788380324840546, + "num_tokens": 360076499.0, + "step": 3367 + }, + { + "epoch": 7.6727480045610035, + "grad_norm": 3.09375, + "learning_rate": 7.064546506337386e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8831783980131149, + "num_tokens": 360183723.0, + "step": 3368 + }, + { + "epoch": 7.67502850627138, + "grad_norm": 3.5625, + "learning_rate": 7.051430686502764e-07, + "loss": 0.5763, + "mean_token_accuracy": 0.8796802908182144, + "num_tokens": 360291600.0, + "step": 3369 + }, + { + "epoch": 7.677309007981756, + "grad_norm": 3.671875, + "learning_rate": 7.038325053929582e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.8816768825054169, + "num_tokens": 360398736.0, + "step": 3370 + }, + { + "epoch": 7.679589509692132, + "grad_norm": 4.3125, + "learning_rate": 7.025229616056326e-07, + "loss": 0.5799, + "mean_token_accuracy": 0.8814764469861984, + "num_tokens": 360505355.0, + "step": 3371 + }, + { + "epoch": 7.681870011402508, + "grad_norm": 2.9375, + "learning_rate": 7.012144380315724e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8859875947237015, + "num_tokens": 360612219.0, + "step": 3372 + }, + { + "epoch": 7.684150513112884, + "grad_norm": 3.640625, + "learning_rate": 6.999069354134703e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8861010521650314, + "num_tokens": 360720165.0, + "step": 3373 + }, + { + "epoch": 7.6864310148232615, + "grad_norm": 2.578125, + "learning_rate": 6.986004544934394e-07, + "loss": 0.588, + "mean_token_accuracy": 0.8770526051521301, + "num_tokens": 360827091.0, + "step": 3374 + }, + { + "epoch": 7.688711516533638, + "grad_norm": 3.0, + "learning_rate": 6.972949960130135e-07, + "loss": 0.5684, + "mean_token_accuracy": 0.8841615915298462, + "num_tokens": 360934168.0, + "step": 3375 + }, + { + "epoch": 7.690992018244014, + "grad_norm": 3.71875, + "learning_rate": 6.959905607131457e-07, + "loss": 0.5652, + "mean_token_accuracy": 0.8876559138298035, + "num_tokens": 361040859.0, + "step": 3376 + }, + { + "epoch": 7.69327251995439, + "grad_norm": 3.828125, + "learning_rate": 6.946871493342072e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8838866651058197, + "num_tokens": 361148098.0, + "step": 3377 + }, + { + "epoch": 7.695553021664766, + "grad_norm": 3.3125, + "learning_rate": 6.933847626159898e-07, + "loss": 0.5684, + "mean_token_accuracy": 0.8839807361364365, + "num_tokens": 361255309.0, + "step": 3378 + }, + { + "epoch": 7.6978335233751425, + "grad_norm": 3.078125, + "learning_rate": 6.920834012977032e-07, + "loss": 0.5935, + "mean_token_accuracy": 0.878353163599968, + "num_tokens": 361362125.0, + "step": 3379 + }, + { + "epoch": 7.700114025085519, + "grad_norm": 2.9375, + "learning_rate": 6.907830661179757e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8843338489532471, + "num_tokens": 361468866.0, + "step": 3380 + }, + { + "epoch": 7.702394526795895, + "grad_norm": 2.5, + "learning_rate": 6.894837578148505e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8871625512838364, + "num_tokens": 361576767.0, + "step": 3381 + }, + { + "epoch": 7.704675028506271, + "grad_norm": 2.953125, + "learning_rate": 6.881854771257912e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.8831998556852341, + "num_tokens": 361683691.0, + "step": 3382 + }, + { + "epoch": 7.706955530216648, + "grad_norm": 3.125, + "learning_rate": 6.868882247876776e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8782538622617722, + "num_tokens": 361790388.0, + "step": 3383 + }, + { + "epoch": 7.7092360319270234, + "grad_norm": 3.75, + "learning_rate": 6.855920015368032e-07, + "loss": 0.5964, + "mean_token_accuracy": 0.8798200190067291, + "num_tokens": 361897277.0, + "step": 3384 + }, + { + "epoch": 7.7115165336374005, + "grad_norm": 3.265625, + "learning_rate": 6.8429680810888e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.882276862859726, + "num_tokens": 362003897.0, + "step": 3385 + }, + { + "epoch": 7.713797035347777, + "grad_norm": 2.859375, + "learning_rate": 6.830026452390354e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.8820279538631439, + "num_tokens": 362111309.0, + "step": 3386 + }, + { + "epoch": 7.716077537058153, + "grad_norm": 2.71875, + "learning_rate": 6.817095136618113e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8817304819822311, + "num_tokens": 362218467.0, + "step": 3387 + }, + { + "epoch": 7.718358038768529, + "grad_norm": 3.890625, + "learning_rate": 6.804174141111631e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8817280679941177, + "num_tokens": 362326122.0, + "step": 3388 + }, + { + "epoch": 7.720638540478905, + "grad_norm": 3.765625, + "learning_rate": 6.791263473204624e-07, + "loss": 0.586, + "mean_token_accuracy": 0.878828227519989, + "num_tokens": 362433157.0, + "step": 3389 + }, + { + "epoch": 7.7229190421892815, + "grad_norm": 2.625, + "learning_rate": 6.778363140224933e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8845448940992355, + "num_tokens": 362540668.0, + "step": 3390 + }, + { + "epoch": 7.725199543899658, + "grad_norm": 3.46875, + "learning_rate": 6.765473149494545e-07, + "loss": 0.56, + "mean_token_accuracy": 0.884161964058876, + "num_tokens": 362648198.0, + "step": 3391 + }, + { + "epoch": 7.727480045610034, + "grad_norm": 3.75, + "learning_rate": 6.752593508329572e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8821451961994171, + "num_tokens": 362755339.0, + "step": 3392 + }, + { + "epoch": 7.72976054732041, + "grad_norm": 2.625, + "learning_rate": 6.739724224040236e-07, + "loss": 0.5711, + "mean_token_accuracy": 0.8820271492004395, + "num_tokens": 362862783.0, + "step": 3393 + }, + { + "epoch": 7.732041049030787, + "grad_norm": 3.203125, + "learning_rate": 6.726865303930905e-07, + "loss": 0.5809, + "mean_token_accuracy": 0.8791635781526566, + "num_tokens": 362969748.0, + "step": 3394 + }, + { + "epoch": 7.734321550741163, + "grad_norm": 2.5625, + "learning_rate": 6.714016755300048e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8794443905353546, + "num_tokens": 363076599.0, + "step": 3395 + }, + { + "epoch": 7.7366020524515395, + "grad_norm": 3.40625, + "learning_rate": 6.701178585440257e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.8836392611265182, + "num_tokens": 363184034.0, + "step": 3396 + }, + { + "epoch": 7.738882554161916, + "grad_norm": 2.75, + "learning_rate": 6.688350801638235e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.8802366703748703, + "num_tokens": 363290837.0, + "step": 3397 + }, + { + "epoch": 7.741163055872292, + "grad_norm": 4.84375, + "learning_rate": 6.67553341117477e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.8817736506462097, + "num_tokens": 363398189.0, + "step": 3398 + }, + { + "epoch": 7.743443557582668, + "grad_norm": 2.9375, + "learning_rate": 6.662726421324775e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8845831155776978, + "num_tokens": 363505198.0, + "step": 3399 + }, + { + "epoch": 7.745724059293044, + "grad_norm": 4.84375, + "learning_rate": 6.649929839357247e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.8841813802719116, + "num_tokens": 363612622.0, + "step": 3400 + }, + { + "epoch": 7.7480045610034205, + "grad_norm": 4.1875, + "learning_rate": 6.637143672535282e-07, + "loss": 0.561, + "mean_token_accuracy": 0.8850229233503342, + "num_tokens": 363719127.0, + "step": 3401 + }, + { + "epoch": 7.750285062713797, + "grad_norm": 3.078125, + "learning_rate": 6.624367928116066e-07, + "loss": 0.5797, + "mean_token_accuracy": 0.8808367401361465, + "num_tokens": 363826026.0, + "step": 3402 + }, + { + "epoch": 7.752565564424174, + "grad_norm": 4.65625, + "learning_rate": 6.611602613350854e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8835535049438477, + "num_tokens": 363933444.0, + "step": 3403 + }, + { + "epoch": 7.75484606613455, + "grad_norm": 2.78125, + "learning_rate": 6.598847735485001e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8848689496517181, + "num_tokens": 364040465.0, + "step": 3404 + }, + { + "epoch": 7.757126567844926, + "grad_norm": 3.921875, + "learning_rate": 6.586103301757918e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.880999892950058, + "num_tokens": 364147442.0, + "step": 3405 + }, + { + "epoch": 7.759407069555302, + "grad_norm": 3.765625, + "learning_rate": 6.573369319403108e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.883205309510231, + "num_tokens": 364255015.0, + "step": 3406 + }, + { + "epoch": 7.7616875712656785, + "grad_norm": 3.59375, + "learning_rate": 6.560645795648132e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.8813123106956482, + "num_tokens": 364362027.0, + "step": 3407 + }, + { + "epoch": 7.763968072976055, + "grad_norm": 3.609375, + "learning_rate": 6.547932737714624e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8851943165063858, + "num_tokens": 364469642.0, + "step": 3408 + }, + { + "epoch": 7.766248574686431, + "grad_norm": 3.484375, + "learning_rate": 6.535230152818256e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8846663683652878, + "num_tokens": 364576740.0, + "step": 3409 + }, + { + "epoch": 7.768529076396807, + "grad_norm": 3.6875, + "learning_rate": 6.522538048168777e-07, + "loss": 0.5721, + "mean_token_accuracy": 0.8818811625242233, + "num_tokens": 364683980.0, + "step": 3410 + }, + { + "epoch": 7.770809578107183, + "grad_norm": 4.71875, + "learning_rate": 6.509856430969982e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.8817369639873505, + "num_tokens": 364790604.0, + "step": 3411 + }, + { + "epoch": 7.7730900798175595, + "grad_norm": 4.71875, + "learning_rate": 6.49718530841971e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.876135990023613, + "num_tokens": 364897881.0, + "step": 3412 + }, + { + "epoch": 7.775370581527936, + "grad_norm": 4.375, + "learning_rate": 6.484524687709853e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8819546699523926, + "num_tokens": 365004584.0, + "step": 3413 + }, + { + "epoch": 7.777651083238313, + "grad_norm": 2.640625, + "learning_rate": 6.471874576026321e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8798770606517792, + "num_tokens": 365111502.0, + "step": 3414 + }, + { + "epoch": 7.779931584948689, + "grad_norm": 3.34375, + "learning_rate": 6.459234980549081e-07, + "loss": 0.5942, + "mean_token_accuracy": 0.8755921274423599, + "num_tokens": 365217932.0, + "step": 3415 + }, + { + "epoch": 7.782212086659065, + "grad_norm": 3.3125, + "learning_rate": 6.446605908452122e-07, + "loss": 0.573, + "mean_token_accuracy": 0.8808364421129227, + "num_tokens": 365324803.0, + "step": 3416 + }, + { + "epoch": 7.784492588369441, + "grad_norm": 3.0, + "learning_rate": 6.433987366903461e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.8788796812295914, + "num_tokens": 365432029.0, + "step": 3417 + }, + { + "epoch": 7.7867730900798175, + "grad_norm": 2.765625, + "learning_rate": 6.421379363065142e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8842990696430206, + "num_tokens": 365539725.0, + "step": 3418 + }, + { + "epoch": 7.789053591790194, + "grad_norm": 2.78125, + "learning_rate": 6.408781904093228e-07, + "loss": 0.585, + "mean_token_accuracy": 0.8797939419746399, + "num_tokens": 365646379.0, + "step": 3419 + }, + { + "epoch": 7.79133409350057, + "grad_norm": 3.21875, + "learning_rate": 6.39619499713778e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8892961293458939, + "num_tokens": 365753657.0, + "step": 3420 + }, + { + "epoch": 7.793614595210946, + "grad_norm": 2.796875, + "learning_rate": 6.383618649342894e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8814832419157028, + "num_tokens": 365859878.0, + "step": 3421 + }, + { + "epoch": 7.795895096921322, + "grad_norm": 3.40625, + "learning_rate": 6.371052867846658e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8824429661035538, + "num_tokens": 365966802.0, + "step": 3422 + }, + { + "epoch": 7.798175598631699, + "grad_norm": 3.234375, + "learning_rate": 6.358497659781177e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8872740119695663, + "num_tokens": 366073841.0, + "step": 3423 + }, + { + "epoch": 7.800456100342076, + "grad_norm": 4.1875, + "learning_rate": 6.345953032272525e-07, + "loss": 0.5757, + "mean_token_accuracy": 0.8818953335285187, + "num_tokens": 366180970.0, + "step": 3424 + }, + { + "epoch": 7.802736602052452, + "grad_norm": 3.3125, + "learning_rate": 6.333418992440804e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8802995383739471, + "num_tokens": 366287962.0, + "step": 3425 + }, + { + "epoch": 7.805017103762828, + "grad_norm": 3.015625, + "learning_rate": 6.3208955474001e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8855053037405014, + "num_tokens": 366395208.0, + "step": 3426 + }, + { + "epoch": 7.807297605473204, + "grad_norm": 2.71875, + "learning_rate": 6.308382704258459e-07, + "loss": 0.5819, + "mean_token_accuracy": 0.8811968117952347, + "num_tokens": 366502174.0, + "step": 3427 + }, + { + "epoch": 7.80957810718358, + "grad_norm": 5.4375, + "learning_rate": 6.29588047011794e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8795420229434967, + "num_tokens": 366609589.0, + "step": 3428 + }, + { + "epoch": 7.811858608893957, + "grad_norm": 2.375, + "learning_rate": 6.283388852074576e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8850322365760803, + "num_tokens": 366716534.0, + "step": 3429 + }, + { + "epoch": 7.814139110604333, + "grad_norm": 2.703125, + "learning_rate": 6.270907857218356e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.882201686501503, + "num_tokens": 366823931.0, + "step": 3430 + }, + { + "epoch": 7.816419612314709, + "grad_norm": 3.265625, + "learning_rate": 6.258437492633254e-07, + "loss": 0.552, + "mean_token_accuracy": 0.887025773525238, + "num_tokens": 366930916.0, + "step": 3431 + }, + { + "epoch": 7.818700114025085, + "grad_norm": 2.484375, + "learning_rate": 6.245977765397216e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.8866972625255585, + "num_tokens": 367038203.0, + "step": 3432 + }, + { + "epoch": 7.820980615735461, + "grad_norm": 3.28125, + "learning_rate": 6.233528682582132e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8818273693323135, + "num_tokens": 367145120.0, + "step": 3433 + }, + { + "epoch": 7.823261117445838, + "grad_norm": 3.796875, + "learning_rate": 6.221090251253872e-07, + "loss": 0.5571, + "mean_token_accuracy": 0.8845161199569702, + "num_tokens": 367252872.0, + "step": 3434 + }, + { + "epoch": 7.825541619156215, + "grad_norm": 2.625, + "learning_rate": 6.208662478472249e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.8831181675195694, + "num_tokens": 367359928.0, + "step": 3435 + }, + { + "epoch": 7.827822120866591, + "grad_norm": 2.53125, + "learning_rate": 6.196245371291015e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8835492432117462, + "num_tokens": 367467008.0, + "step": 3436 + }, + { + "epoch": 7.830102622576967, + "grad_norm": 3.265625, + "learning_rate": 6.183838936757891e-07, + "loss": 0.5847, + "mean_token_accuracy": 0.8792263269424438, + "num_tokens": 367574251.0, + "step": 3437 + }, + { + "epoch": 7.832383124287343, + "grad_norm": 2.84375, + "learning_rate": 6.171443181914524e-07, + "loss": 0.5484, + "mean_token_accuracy": 0.8878850936889648, + "num_tokens": 367682131.0, + "step": 3438 + }, + { + "epoch": 7.834663625997719, + "grad_norm": 3.15625, + "learning_rate": 6.159058113796507e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8801481425762177, + "num_tokens": 367789432.0, + "step": 3439 + }, + { + "epoch": 7.836944127708096, + "grad_norm": 2.71875, + "learning_rate": 6.146683739433374e-07, + "loss": 0.5901, + "mean_token_accuracy": 0.8798154145479202, + "num_tokens": 367896299.0, + "step": 3440 + }, + { + "epoch": 7.839224629418472, + "grad_norm": 2.8125, + "learning_rate": 6.134320065848564e-07, + "loss": 0.5795, + "mean_token_accuracy": 0.8814557641744614, + "num_tokens": 368003358.0, + "step": 3441 + }, + { + "epoch": 7.841505131128848, + "grad_norm": 2.828125, + "learning_rate": 6.121967100059473e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8781363666057587, + "num_tokens": 368110282.0, + "step": 3442 + }, + { + "epoch": 7.843785632839225, + "grad_norm": 4.28125, + "learning_rate": 6.109624849077397e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8851889967918396, + "num_tokens": 368217316.0, + "step": 3443 + }, + { + "epoch": 7.846066134549601, + "grad_norm": 2.84375, + "learning_rate": 6.097293319907566e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8824188113212585, + "num_tokens": 368324078.0, + "step": 3444 + }, + { + "epoch": 7.848346636259977, + "grad_norm": 3.390625, + "learning_rate": 6.084972519549123e-07, + "loss": 0.5895, + "mean_token_accuracy": 0.8789727538824081, + "num_tokens": 368430695.0, + "step": 3445 + }, + { + "epoch": 7.850627137970354, + "grad_norm": 4.78125, + "learning_rate": 6.072662454995101e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8788609057664871, + "num_tokens": 368537559.0, + "step": 3446 + }, + { + "epoch": 7.85290763968073, + "grad_norm": 3.28125, + "learning_rate": 6.060363133232472e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8846858441829681, + "num_tokens": 368645105.0, + "step": 3447 + }, + { + "epoch": 7.855188141391106, + "grad_norm": 3.421875, + "learning_rate": 6.048074561242076e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.8781778067350388, + "num_tokens": 368752515.0, + "step": 3448 + }, + { + "epoch": 7.857468643101482, + "grad_norm": 2.5625, + "learning_rate": 6.035796745998679e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8889462649822235, + "num_tokens": 368859963.0, + "step": 3449 + }, + { + "epoch": 7.859749144811858, + "grad_norm": 3.265625, + "learning_rate": 6.023529694470931e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8810675591230392, + "num_tokens": 368966665.0, + "step": 3450 + }, + { + "epoch": 7.862029646522235, + "grad_norm": 3.0625, + "learning_rate": 6.01127341362138e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8824652433395386, + "num_tokens": 369073545.0, + "step": 3451 + }, + { + "epoch": 7.864310148232612, + "grad_norm": 2.78125, + "learning_rate": 5.999027910406441e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8798021376132965, + "num_tokens": 369180613.0, + "step": 3452 + }, + { + "epoch": 7.866590649942988, + "grad_norm": 3.390625, + "learning_rate": 5.98679319177643e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8812180906534195, + "num_tokens": 369287568.0, + "step": 3453 + }, + { + "epoch": 7.868871151653364, + "grad_norm": 3.125, + "learning_rate": 5.974569264675542e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8855846971273422, + "num_tokens": 369394613.0, + "step": 3454 + }, + { + "epoch": 7.87115165336374, + "grad_norm": 3.25, + "learning_rate": 5.962356136041835e-07, + "loss": 0.5601, + "mean_token_accuracy": 0.8867956697940826, + "num_tokens": 369501322.0, + "step": 3455 + }, + { + "epoch": 7.873432155074116, + "grad_norm": 3.265625, + "learning_rate": 5.95015381280726e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.8840297311544418, + "num_tokens": 369608680.0, + "step": 3456 + }, + { + "epoch": 7.875712656784493, + "grad_norm": 3.46875, + "learning_rate": 5.937962301897604e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8828248530626297, + "num_tokens": 369716370.0, + "step": 3457 + }, + { + "epoch": 7.877993158494869, + "grad_norm": 2.9375, + "learning_rate": 5.925781610232534e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8825751543045044, + "num_tokens": 369823697.0, + "step": 3458 + }, + { + "epoch": 7.880273660205245, + "grad_norm": 4.5625, + "learning_rate": 5.913611744725584e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8807247132062912, + "num_tokens": 369931399.0, + "step": 3459 + }, + { + "epoch": 7.882554161915621, + "grad_norm": 3.109375, + "learning_rate": 5.901452712284128e-07, + "loss": 0.584, + "mean_token_accuracy": 0.881399855017662, + "num_tokens": 370038799.0, + "step": 3460 + }, + { + "epoch": 7.884834663625997, + "grad_norm": 5.0, + "learning_rate": 5.889304519809402e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.8788302540779114, + "num_tokens": 370146637.0, + "step": 3461 + }, + { + "epoch": 7.887115165336374, + "grad_norm": 2.84375, + "learning_rate": 5.877167174196491e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8876172602176666, + "num_tokens": 370253754.0, + "step": 3462 + }, + { + "epoch": 7.889395667046751, + "grad_norm": 3.015625, + "learning_rate": 5.865040682334303e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.8796623796224594, + "num_tokens": 370360287.0, + "step": 3463 + }, + { + "epoch": 7.891676168757127, + "grad_norm": 3.875, + "learning_rate": 5.852925051105609e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.8807369768619537, + "num_tokens": 370467034.0, + "step": 3464 + }, + { + "epoch": 7.893956670467503, + "grad_norm": 2.84375, + "learning_rate": 5.840820287387009e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8808783739805222, + "num_tokens": 370573928.0, + "step": 3465 + }, + { + "epoch": 7.896237172177879, + "grad_norm": 2.953125, + "learning_rate": 5.828726398048939e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8822729289531708, + "num_tokens": 370680253.0, + "step": 3466 + }, + { + "epoch": 7.898517673888255, + "grad_norm": 2.984375, + "learning_rate": 5.816643389955642e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8849064260721207, + "num_tokens": 370786997.0, + "step": 3467 + }, + { + "epoch": 7.900798175598632, + "grad_norm": 3.5, + "learning_rate": 5.804571269965206e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.8811779320240021, + "num_tokens": 370893845.0, + "step": 3468 + }, + { + "epoch": 7.903078677309008, + "grad_norm": 4.5625, + "learning_rate": 5.792510044929545e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8799442201852798, + "num_tokens": 371000695.0, + "step": 3469 + }, + { + "epoch": 7.905359179019384, + "grad_norm": 2.609375, + "learning_rate": 5.780459721694359e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8812065422534943, + "num_tokens": 371107589.0, + "step": 3470 + }, + { + "epoch": 7.90763968072976, + "grad_norm": 3.203125, + "learning_rate": 5.768420307099188e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8848077207803726, + "num_tokens": 371214705.0, + "step": 3471 + }, + { + "epoch": 7.909920182440137, + "grad_norm": 3.28125, + "learning_rate": 5.756391807977377e-07, + "loss": 0.5721, + "mean_token_accuracy": 0.8829392790794373, + "num_tokens": 371321954.0, + "step": 3472 + }, + { + "epoch": 7.9122006841505135, + "grad_norm": 3.84375, + "learning_rate": 5.744374231156056e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.87844018638134, + "num_tokens": 371428415.0, + "step": 3473 + }, + { + "epoch": 7.91448118586089, + "grad_norm": 4.96875, + "learning_rate": 5.732367583456177e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.879816859960556, + "num_tokens": 371535322.0, + "step": 3474 + }, + { + "epoch": 7.916761687571266, + "grad_norm": 2.625, + "learning_rate": 5.720371871692484e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.8838074952363968, + "num_tokens": 371642326.0, + "step": 3475 + }, + { + "epoch": 7.919042189281642, + "grad_norm": 3.0625, + "learning_rate": 5.708387102673507e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8860156238079071, + "num_tokens": 371749387.0, + "step": 3476 + }, + { + "epoch": 7.921322690992018, + "grad_norm": 3.109375, + "learning_rate": 5.696413283201571e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8818910270929337, + "num_tokens": 371856387.0, + "step": 3477 + }, + { + "epoch": 7.923603192702394, + "grad_norm": 2.75, + "learning_rate": 5.684450420072792e-07, + "loss": 0.5595, + "mean_token_accuracy": 0.8838623762130737, + "num_tokens": 371963635.0, + "step": 3478 + }, + { + "epoch": 7.925883694412771, + "grad_norm": 2.78125, + "learning_rate": 5.67249852007705e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8921998292207718, + "num_tokens": 372071179.0, + "step": 3479 + }, + { + "epoch": 7.928164196123147, + "grad_norm": 3.765625, + "learning_rate": 5.660557589998014e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.88170425593853, + "num_tokens": 372178998.0, + "step": 3480 + }, + { + "epoch": 7.930444697833523, + "grad_norm": 4.84375, + "learning_rate": 5.648627636613127e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8786900341510773, + "num_tokens": 372286187.0, + "step": 3481 + }, + { + "epoch": 7.932725199543899, + "grad_norm": 3.78125, + "learning_rate": 5.636708666693599e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.887066513299942, + "num_tokens": 372393703.0, + "step": 3482 + }, + { + "epoch": 7.935005701254276, + "grad_norm": 2.8125, + "learning_rate": 5.62480068700442e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.879653736948967, + "num_tokens": 372500635.0, + "step": 3483 + }, + { + "epoch": 7.9372862029646525, + "grad_norm": 3.03125, + "learning_rate": 5.612903704304309e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8889681696891785, + "num_tokens": 372608260.0, + "step": 3484 + }, + { + "epoch": 7.939566704675029, + "grad_norm": 3.984375, + "learning_rate": 5.601017725345772e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.8840508311986923, + "num_tokens": 372715730.0, + "step": 3485 + }, + { + "epoch": 7.941847206385405, + "grad_norm": 2.765625, + "learning_rate": 5.589142756875065e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.8822902888059616, + "num_tokens": 372822393.0, + "step": 3486 + }, + { + "epoch": 7.944127708095781, + "grad_norm": 3.6875, + "learning_rate": 5.577278805632186e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.8777175396680832, + "num_tokens": 372929636.0, + "step": 3487 + }, + { + "epoch": 7.946408209806157, + "grad_norm": 2.6875, + "learning_rate": 5.565425878350895e-07, + "loss": 0.581, + "mean_token_accuracy": 0.8786440938711166, + "num_tokens": 373036503.0, + "step": 3488 + }, + { + "epoch": 7.9486887115165334, + "grad_norm": 2.34375, + "learning_rate": 5.553583981758668e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8891723304986954, + "num_tokens": 373143771.0, + "step": 3489 + }, + { + "epoch": 7.95096921322691, + "grad_norm": 2.625, + "learning_rate": 5.541753122576746e-07, + "loss": 0.5589, + "mean_token_accuracy": 0.8819065690040588, + "num_tokens": 373250701.0, + "step": 3490 + }, + { + "epoch": 7.953249714937286, + "grad_norm": 3.0625, + "learning_rate": 5.529933307520102e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8800751566886902, + "num_tokens": 373357804.0, + "step": 3491 + }, + { + "epoch": 7.955530216647663, + "grad_norm": 2.75, + "learning_rate": 5.518124543297423e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8874002993106842, + "num_tokens": 373464625.0, + "step": 3492 + }, + { + "epoch": 7.957810718358039, + "grad_norm": 4.09375, + "learning_rate": 5.506326836611139e-07, + "loss": 0.5783, + "mean_token_accuracy": 0.8810268938541412, + "num_tokens": 373572124.0, + "step": 3493 + }, + { + "epoch": 7.960091220068415, + "grad_norm": 3.734375, + "learning_rate": 5.494540194157411e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8803278654813766, + "num_tokens": 373678778.0, + "step": 3494 + }, + { + "epoch": 7.9623717217787915, + "grad_norm": 2.90625, + "learning_rate": 5.482764622626094e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8871753364801407, + "num_tokens": 373785455.0, + "step": 3495 + }, + { + "epoch": 7.964652223489168, + "grad_norm": 2.890625, + "learning_rate": 5.471000128700784e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.8817069977521896, + "num_tokens": 373892856.0, + "step": 3496 + }, + { + "epoch": 7.966932725199544, + "grad_norm": 3.265625, + "learning_rate": 5.459246719058778e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8817310035228729, + "num_tokens": 374000078.0, + "step": 3497 + }, + { + "epoch": 7.96921322690992, + "grad_norm": 3.15625, + "learning_rate": 5.447504400371084e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8855457454919815, + "num_tokens": 374107031.0, + "step": 3498 + }, + { + "epoch": 7.971493728620296, + "grad_norm": 3.40625, + "learning_rate": 5.435773179302426e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8799401372671127, + "num_tokens": 374213605.0, + "step": 3499 + }, + { + "epoch": 7.9737742303306725, + "grad_norm": 2.75, + "learning_rate": 5.4240530625112e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.885622963309288, + "num_tokens": 374321608.0, + "step": 3500 + }, + { + "epoch": 7.976054732041049, + "grad_norm": 3.09375, + "learning_rate": 5.412344056649527e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8806732445955276, + "num_tokens": 374427692.0, + "step": 3501 + }, + { + "epoch": 7.978335233751425, + "grad_norm": 3.234375, + "learning_rate": 5.400646168363216e-07, + "loss": 0.5627, + "mean_token_accuracy": 0.885017529129982, + "num_tokens": 374534923.0, + "step": 3502 + }, + { + "epoch": 7.980615735461802, + "grad_norm": 4.21875, + "learning_rate": 5.388959404291757e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8873688131570816, + "num_tokens": 374641673.0, + "step": 3503 + }, + { + "epoch": 7.982896237172178, + "grad_norm": 3.328125, + "learning_rate": 5.377283771068342e-07, + "loss": 0.5787, + "mean_token_accuracy": 0.8803865015506744, + "num_tokens": 374748674.0, + "step": 3504 + }, + { + "epoch": 7.985176738882554, + "grad_norm": 2.90625, + "learning_rate": 5.365619275319823e-07, + "loss": 0.585, + "mean_token_accuracy": 0.879953607916832, + "num_tokens": 374855343.0, + "step": 3505 + }, + { + "epoch": 7.9874572405929305, + "grad_norm": 3.15625, + "learning_rate": 5.353965923666743e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8788940012454987, + "num_tokens": 374962436.0, + "step": 3506 + }, + { + "epoch": 7.989737742303307, + "grad_norm": 2.75, + "learning_rate": 5.342323722723324e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8827593922615051, + "num_tokens": 375068957.0, + "step": 3507 + }, + { + "epoch": 7.992018244013683, + "grad_norm": 3.125, + "learning_rate": 5.330692679097457e-07, + "loss": 0.5441, + "mean_token_accuracy": 0.8856361508369446, + "num_tokens": 375176504.0, + "step": 3508 + }, + { + "epoch": 7.994298745724059, + "grad_norm": 3.78125, + "learning_rate": 5.319072799390693e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8784863501787186, + "num_tokens": 375283610.0, + "step": 3509 + }, + { + "epoch": 7.996579247434435, + "grad_norm": 3.796875, + "learning_rate": 5.307464090198258e-07, + "loss": 0.5606, + "mean_token_accuracy": 0.8854129165410995, + "num_tokens": 375390831.0, + "step": 3510 + }, + { + "epoch": 7.9988597491448115, + "grad_norm": 3.53125, + "learning_rate": 5.295866558109023e-07, + "loss": 0.5461, + "mean_token_accuracy": 0.8877543658018112, + "num_tokens": 375498660.0, + "step": 3511 + }, + { + "epoch": 8.0, + "grad_norm": 5.6875, + "learning_rate": 5.284280209705531e-07, + "loss": 0.5913, + "mean_token_accuracy": 0.8680516481399536, + "num_tokens": 375537856.0, + "step": 3512 + }, + { + "epoch": 8.002280501710377, + "grad_norm": 3.078125, + "learning_rate": 5.272705051563959e-07, + "loss": 0.5911, + "mean_token_accuracy": 0.8796858638525009, + "num_tokens": 375645015.0, + "step": 3513 + }, + { + "epoch": 8.004561003420752, + "grad_norm": 2.78125, + "learning_rate": 5.261141090254149e-07, + "loss": 0.57, + "mean_token_accuracy": 0.880715399980545, + "num_tokens": 375752133.0, + "step": 3514 + }, + { + "epoch": 8.00684150513113, + "grad_norm": 2.8125, + "learning_rate": 5.249588332339589e-07, + "loss": 0.5635, + "mean_token_accuracy": 0.8854578584432602, + "num_tokens": 375859485.0, + "step": 3515 + }, + { + "epoch": 8.009122006841505, + "grad_norm": 3.09375, + "learning_rate": 5.238046784377388e-07, + "loss": 0.5818, + "mean_token_accuracy": 0.8783604055643082, + "num_tokens": 375965833.0, + "step": 3516 + }, + { + "epoch": 8.011402508551882, + "grad_norm": 2.90625, + "learning_rate": 5.226516452918315e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8814006298780441, + "num_tokens": 376072854.0, + "step": 3517 + }, + { + "epoch": 8.013683010262257, + "grad_norm": 2.84375, + "learning_rate": 5.214997344506758e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.884096547961235, + "num_tokens": 376179522.0, + "step": 3518 + }, + { + "epoch": 8.015963511972634, + "grad_norm": 3.140625, + "learning_rate": 5.203489465680747e-07, + "loss": 0.5921, + "mean_token_accuracy": 0.8821551650762558, + "num_tokens": 376286201.0, + "step": 3519 + }, + { + "epoch": 8.01824401368301, + "grad_norm": 3.234375, + "learning_rate": 5.19199282297193e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8864544034004211, + "num_tokens": 376394233.0, + "step": 3520 + }, + { + "epoch": 8.01824401368301, + "eval_loss": 0.5862906575202942, + "eval_mean_token_accuracy": 0.8799242350085154, + "eval_num_tokens": 376394233.0, + "eval_runtime": 58.678, + "eval_samples_per_second": 142.899, + "eval_steps_per_second": 4.482, + "step": 3520 + }, + { + "epoch": 8.020524515393387, + "grad_norm": 2.34375, + "learning_rate": 5.180507422905585e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8834462761878967, + "num_tokens": 376501043.0, + "step": 3521 + }, + { + "epoch": 8.022805017103764, + "grad_norm": 2.78125, + "learning_rate": 5.169033272000587e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8795521408319473, + "num_tokens": 376607841.0, + "step": 3522 + }, + { + "epoch": 8.025085518814139, + "grad_norm": 2.765625, + "learning_rate": 5.157570376769452e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8829406499862671, + "num_tokens": 376714615.0, + "step": 3523 + }, + { + "epoch": 8.027366020524516, + "grad_norm": 2.796875, + "learning_rate": 5.146118743718301e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.885052427649498, + "num_tokens": 376822883.0, + "step": 3524 + }, + { + "epoch": 8.029646522234891, + "grad_norm": 3.390625, + "learning_rate": 5.134678379346856e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8828334510326385, + "num_tokens": 376929625.0, + "step": 3525 + }, + { + "epoch": 8.031927023945268, + "grad_norm": 3.796875, + "learning_rate": 5.123249290148452e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.8789701759815216, + "num_tokens": 377036782.0, + "step": 3526 + }, + { + "epoch": 8.034207525655644, + "grad_norm": 3.328125, + "learning_rate": 5.111831482610011e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8839947730302811, + "num_tokens": 377143972.0, + "step": 3527 + }, + { + "epoch": 8.03648802736602, + "grad_norm": 3.203125, + "learning_rate": 5.100424963212064e-07, + "loss": 0.5748, + "mean_token_accuracy": 0.8813960254192352, + "num_tokens": 377250375.0, + "step": 3528 + }, + { + "epoch": 8.038768529076396, + "grad_norm": 2.46875, + "learning_rate": 5.089029738428733e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8840717822313309, + "num_tokens": 377357434.0, + "step": 3529 + }, + { + "epoch": 8.041049030786773, + "grad_norm": 3.90625, + "learning_rate": 5.077645814727725e-07, + "loss": 0.5816, + "mean_token_accuracy": 0.8795655369758606, + "num_tokens": 377464466.0, + "step": 3530 + }, + { + "epoch": 8.043329532497149, + "grad_norm": 3.53125, + "learning_rate": 5.066273198570343e-07, + "loss": 0.5863, + "mean_token_accuracy": 0.8808377087116241, + "num_tokens": 377570549.0, + "step": 3531 + }, + { + "epoch": 8.045610034207526, + "grad_norm": 4.09375, + "learning_rate": 5.054911896411452e-07, + "loss": 0.5662, + "mean_token_accuracy": 0.8835866749286652, + "num_tokens": 377678050.0, + "step": 3532 + }, + { + "epoch": 8.047890535917903, + "grad_norm": 2.65625, + "learning_rate": 5.043561914699513e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8832181841135025, + "num_tokens": 377785329.0, + "step": 3533 + }, + { + "epoch": 8.050171037628278, + "grad_norm": 3.609375, + "learning_rate": 5.032223259876565e-07, + "loss": 0.5881, + "mean_token_accuracy": 0.8773612827062607, + "num_tokens": 377892051.0, + "step": 3534 + }, + { + "epoch": 8.052451539338655, + "grad_norm": 3.71875, + "learning_rate": 5.020895938378195e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8818919360637665, + "num_tokens": 377999467.0, + "step": 3535 + }, + { + "epoch": 8.05473204104903, + "grad_norm": 2.6875, + "learning_rate": 5.009579956633578e-07, + "loss": 0.5568, + "mean_token_accuracy": 0.884328693151474, + "num_tokens": 378106965.0, + "step": 3536 + }, + { + "epoch": 8.057012542759407, + "grad_norm": 3.46875, + "learning_rate": 4.998275321065454e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8801892846822739, + "num_tokens": 378214261.0, + "step": 3537 + }, + { + "epoch": 8.059293044469783, + "grad_norm": 3.5, + "learning_rate": 4.986982038090104e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8822603821754456, + "num_tokens": 378320449.0, + "step": 3538 + }, + { + "epoch": 8.06157354618016, + "grad_norm": 2.921875, + "learning_rate": 4.975700114117385e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8873136639595032, + "num_tokens": 378427513.0, + "step": 3539 + }, + { + "epoch": 8.063854047890535, + "grad_norm": 2.875, + "learning_rate": 4.964429555550693e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8837717622518539, + "num_tokens": 378534328.0, + "step": 3540 + }, + { + "epoch": 8.066134549600912, + "grad_norm": 3.296875, + "learning_rate": 4.953170368786985e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8866355121135712, + "num_tokens": 378641467.0, + "step": 3541 + }, + { + "epoch": 8.06841505131129, + "grad_norm": 2.90625, + "learning_rate": 4.941922560216764e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8841693699359894, + "num_tokens": 378748281.0, + "step": 3542 + }, + { + "epoch": 8.070695553021665, + "grad_norm": 3.125, + "learning_rate": 4.930686136224056e-07, + "loss": 0.5686, + "mean_token_accuracy": 0.8798070102930069, + "num_tokens": 378854984.0, + "step": 3543 + }, + { + "epoch": 8.072976054732042, + "grad_norm": 4.90625, + "learning_rate": 4.91946110318644e-07, + "loss": 0.5871, + "mean_token_accuracy": 0.8762340992689133, + "num_tokens": 378962257.0, + "step": 3544 + }, + { + "epoch": 8.075256556442417, + "grad_norm": 2.390625, + "learning_rate": 4.908247467475036e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8846005648374557, + "num_tokens": 379069977.0, + "step": 3545 + }, + { + "epoch": 8.077537058152794, + "grad_norm": 3.828125, + "learning_rate": 4.897045235454481e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.882202997803688, + "num_tokens": 379177064.0, + "step": 3546 + }, + { + "epoch": 8.07981755986317, + "grad_norm": 5.0, + "learning_rate": 4.885854413482955e-07, + "loss": 0.5663, + "mean_token_accuracy": 0.8858351409435272, + "num_tokens": 379284331.0, + "step": 3547 + }, + { + "epoch": 8.082098061573546, + "grad_norm": 3.921875, + "learning_rate": 4.874675007912138e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8848386406898499, + "num_tokens": 379391861.0, + "step": 3548 + }, + { + "epoch": 8.084378563283922, + "grad_norm": 4.96875, + "learning_rate": 4.863507025087255e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8858330547809601, + "num_tokens": 379499126.0, + "step": 3549 + }, + { + "epoch": 8.086659064994299, + "grad_norm": 3.015625, + "learning_rate": 4.852350471347031e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8791882544755936, + "num_tokens": 379605713.0, + "step": 3550 + }, + { + "epoch": 8.088939566704674, + "grad_norm": 3.109375, + "learning_rate": 4.841205353023715e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.881306603550911, + "num_tokens": 379712727.0, + "step": 3551 + }, + { + "epoch": 8.091220068415051, + "grad_norm": 2.875, + "learning_rate": 4.83007167644306e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.8896623253822327, + "num_tokens": 379819603.0, + "step": 3552 + }, + { + "epoch": 8.093500570125428, + "grad_norm": 3.796875, + "learning_rate": 4.818949447924334e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.881472259759903, + "num_tokens": 379926447.0, + "step": 3553 + }, + { + "epoch": 8.095781071835804, + "grad_norm": 2.59375, + "learning_rate": 4.807838673780282e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.8816197216510773, + "num_tokens": 380033718.0, + "step": 3554 + }, + { + "epoch": 8.09806157354618, + "grad_norm": 2.96875, + "learning_rate": 4.796739360317181e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8843680173158646, + "num_tokens": 380140392.0, + "step": 3555 + }, + { + "epoch": 8.100342075256556, + "grad_norm": 2.8125, + "learning_rate": 4.785651513834774e-07, + "loss": 0.5553, + "mean_token_accuracy": 0.8834531456232071, + "num_tokens": 380247239.0, + "step": 3556 + }, + { + "epoch": 8.102622576966933, + "grad_norm": 4.78125, + "learning_rate": 4.774575140626317e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8883015066385269, + "num_tokens": 380354451.0, + "step": 3557 + }, + { + "epoch": 8.104903078677308, + "grad_norm": 4.125, + "learning_rate": 4.763510246978548e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8845552057027817, + "num_tokens": 380462073.0, + "step": 3558 + }, + { + "epoch": 8.107183580387685, + "grad_norm": 4.71875, + "learning_rate": 4.7524568391716736e-07, + "loss": 0.5809, + "mean_token_accuracy": 0.8840623497962952, + "num_tokens": 380569477.0, + "step": 3559 + }, + { + "epoch": 8.10946408209806, + "grad_norm": 3.21875, + "learning_rate": 4.7414149234794064e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8819562494754791, + "num_tokens": 380676862.0, + "step": 3560 + }, + { + "epoch": 8.111744583808438, + "grad_norm": 3.8125, + "learning_rate": 4.7303845061689197e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8820130825042725, + "num_tokens": 380783938.0, + "step": 3561 + }, + { + "epoch": 8.114025085518815, + "grad_norm": 3.1875, + "learning_rate": 4.719365593500866e-07, + "loss": 0.5504, + "mean_token_accuracy": 0.8882433474063873, + "num_tokens": 380891146.0, + "step": 3562 + }, + { + "epoch": 8.11630558722919, + "grad_norm": 3.296875, + "learning_rate": 4.7083581917293784e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.8783487379550934, + "num_tokens": 380998117.0, + "step": 3563 + }, + { + "epoch": 8.118586088939567, + "grad_norm": 2.59375, + "learning_rate": 4.6973623071020267e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8855432868003845, + "num_tokens": 381104807.0, + "step": 3564 + }, + { + "epoch": 8.120866590649943, + "grad_norm": 2.75, + "learning_rate": 4.686377945859874e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8837299197912216, + "num_tokens": 381212129.0, + "step": 3565 + }, + { + "epoch": 8.12314709236032, + "grad_norm": 4.375, + "learning_rate": 4.6754051142374275e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8824782967567444, + "num_tokens": 381319310.0, + "step": 3566 + }, + { + "epoch": 8.125427594070695, + "grad_norm": 3.375, + "learning_rate": 4.664443818462658e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8837754726409912, + "num_tokens": 381426819.0, + "step": 3567 + }, + { + "epoch": 8.127708095781072, + "grad_norm": 4.40625, + "learning_rate": 4.653494064756983e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.882844015955925, + "num_tokens": 381533803.0, + "step": 3568 + }, + { + "epoch": 8.129988597491447, + "grad_norm": 2.71875, + "learning_rate": 4.6425558593352796e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8839477002620697, + "num_tokens": 381640832.0, + "step": 3569 + }, + { + "epoch": 8.132269099201825, + "grad_norm": 4.125, + "learning_rate": 4.631629208405847e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8868978321552277, + "num_tokens": 381748214.0, + "step": 3570 + }, + { + "epoch": 8.134549600912202, + "grad_norm": 3.359375, + "learning_rate": 4.620714118170452e-07, + "loss": 0.5508, + "mean_token_accuracy": 0.8868376761674881, + "num_tokens": 381855821.0, + "step": 3571 + }, + { + "epoch": 8.136830102622577, + "grad_norm": 3.09375, + "learning_rate": 4.609810594824282e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.883231908082962, + "num_tokens": 381962902.0, + "step": 3572 + }, + { + "epoch": 8.139110604332954, + "grad_norm": 2.796875, + "learning_rate": 4.598918644555975e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.8844647407531738, + "num_tokens": 382070254.0, + "step": 3573 + }, + { + "epoch": 8.14139110604333, + "grad_norm": 2.828125, + "learning_rate": 4.58803827354759e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.8791768401861191, + "num_tokens": 382177004.0, + "step": 3574 + }, + { + "epoch": 8.143671607753706, + "grad_norm": 3.625, + "learning_rate": 4.5771694879746087e-07, + "loss": 0.5942, + "mean_token_accuracy": 0.874402716755867, + "num_tokens": 382283679.0, + "step": 3575 + }, + { + "epoch": 8.145952109464082, + "grad_norm": 4.4375, + "learning_rate": 4.566312294005948e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8818236142396927, + "num_tokens": 382390417.0, + "step": 3576 + }, + { + "epoch": 8.148232611174459, + "grad_norm": 3.03125, + "learning_rate": 4.5554666978039455e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8824535757303238, + "num_tokens": 382497498.0, + "step": 3577 + }, + { + "epoch": 8.150513112884834, + "grad_norm": 3.90625, + "learning_rate": 4.544632705524343e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.8810164928436279, + "num_tokens": 382604536.0, + "step": 3578 + }, + { + "epoch": 8.152793614595211, + "grad_norm": 3.328125, + "learning_rate": 4.5338103233163175e-07, + "loss": 0.5817, + "mean_token_accuracy": 0.8840733021497726, + "num_tokens": 382711132.0, + "step": 3579 + }, + { + "epoch": 8.155074116305586, + "grad_norm": 2.375, + "learning_rate": 4.522999557322433e-07, + "loss": 0.574, + "mean_token_accuracy": 0.8804437518119812, + "num_tokens": 382817671.0, + "step": 3580 + }, + { + "epoch": 8.157354618015964, + "grad_norm": 3.421875, + "learning_rate": 4.512200413678672e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8838673382997513, + "num_tokens": 382924893.0, + "step": 3581 + }, + { + "epoch": 8.15963511972634, + "grad_norm": 3.0, + "learning_rate": 4.501412898514426e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8765487670898438, + "num_tokens": 383031251.0, + "step": 3582 + }, + { + "epoch": 8.161915621436716, + "grad_norm": 2.40625, + "learning_rate": 4.490637017952479e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8857912421226501, + "num_tokens": 383138617.0, + "step": 3583 + }, + { + "epoch": 8.164196123147093, + "grad_norm": 3.421875, + "learning_rate": 4.4798727781090096e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.883015900850296, + "num_tokens": 383245038.0, + "step": 3584 + }, + { + "epoch": 8.166476624857468, + "grad_norm": 3.109375, + "learning_rate": 4.4691201850936034e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8790012896060944, + "num_tokens": 383351713.0, + "step": 3585 + }, + { + "epoch": 8.168757126567845, + "grad_norm": 3.578125, + "learning_rate": 4.458379245009209e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8837389200925827, + "num_tokens": 383458654.0, + "step": 3586 + }, + { + "epoch": 8.17103762827822, + "grad_norm": 3.3125, + "learning_rate": 4.447649963952183e-07, + "loss": 0.584, + "mean_token_accuracy": 0.8778487145900726, + "num_tokens": 383565763.0, + "step": 3587 + }, + { + "epoch": 8.173318129988598, + "grad_norm": 2.65625, + "learning_rate": 4.43693234801226e-07, + "loss": 0.5907, + "mean_token_accuracy": 0.8766829818487167, + "num_tokens": 383672820.0, + "step": 3588 + }, + { + "epoch": 8.175598631698973, + "grad_norm": 5.4375, + "learning_rate": 4.4262264032725517e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8824921548366547, + "num_tokens": 383780010.0, + "step": 3589 + }, + { + "epoch": 8.17787913340935, + "grad_norm": 3.125, + "learning_rate": 4.41553213580955e-07, + "loss": 0.5849, + "mean_token_accuracy": 0.8807243406772614, + "num_tokens": 383886905.0, + "step": 3590 + }, + { + "epoch": 8.180159635119727, + "grad_norm": 2.703125, + "learning_rate": 4.404849551693102e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.8819932341575623, + "num_tokens": 383994254.0, + "step": 3591 + }, + { + "epoch": 8.182440136830103, + "grad_norm": 2.59375, + "learning_rate": 4.394178656986448e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.881662055850029, + "num_tokens": 384101925.0, + "step": 3592 + }, + { + "epoch": 8.18472063854048, + "grad_norm": 3.046875, + "learning_rate": 4.383519457746174e-07, + "loss": 0.5645, + "mean_token_accuracy": 0.8843112587928772, + "num_tokens": 384209051.0, + "step": 3593 + }, + { + "epoch": 8.187001140250855, + "grad_norm": 2.8125, + "learning_rate": 4.3728719600222374e-07, + "loss": 0.5566, + "mean_token_accuracy": 0.8843548893928528, + "num_tokens": 384316445.0, + "step": 3594 + }, + { + "epoch": 8.189281641961232, + "grad_norm": 2.875, + "learning_rate": 4.3622361698579586e-07, + "loss": 0.5965, + "mean_token_accuracy": 0.8787325769662857, + "num_tokens": 384422950.0, + "step": 3595 + }, + { + "epoch": 8.191562143671607, + "grad_norm": 2.921875, + "learning_rate": 4.351612093290006e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8826261758804321, + "num_tokens": 384530108.0, + "step": 3596 + }, + { + "epoch": 8.193842645381984, + "grad_norm": 3.171875, + "learning_rate": 4.340999736348389e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8850667327642441, + "num_tokens": 384637297.0, + "step": 3597 + }, + { + "epoch": 8.19612314709236, + "grad_norm": 3.15625, + "learning_rate": 4.3303991050564877e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8847819566726685, + "num_tokens": 384744448.0, + "step": 3598 + }, + { + "epoch": 8.198403648802737, + "grad_norm": 2.875, + "learning_rate": 4.3198102054310157e-07, + "loss": 0.5707, + "mean_token_accuracy": 0.8816685527563095, + "num_tokens": 384851534.0, + "step": 3599 + }, + { + "epoch": 8.200684150513112, + "grad_norm": 4.25, + "learning_rate": 4.30923304348202e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.8833976686000824, + "num_tokens": 384958686.0, + "step": 3600 + }, + { + "epoch": 8.20296465222349, + "grad_norm": 2.90625, + "learning_rate": 4.2986676252129047e-07, + "loss": 0.58, + "mean_token_accuracy": 0.8791931867599487, + "num_tokens": 385065535.0, + "step": 3601 + }, + { + "epoch": 8.205245153933866, + "grad_norm": 3.1875, + "learning_rate": 4.288113956620382e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.8821399062871933, + "num_tokens": 385172528.0, + "step": 3602 + }, + { + "epoch": 8.207525655644242, + "grad_norm": 2.828125, + "learning_rate": 4.2775720436945225e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.8869880139827728, + "num_tokens": 385279822.0, + "step": 3603 + }, + { + "epoch": 8.209806157354619, + "grad_norm": 2.984375, + "learning_rate": 4.267041892418705e-07, + "loss": 0.5724, + "mean_token_accuracy": 0.8820626139640808, + "num_tokens": 385387057.0, + "step": 3604 + }, + { + "epoch": 8.212086659064994, + "grad_norm": 2.703125, + "learning_rate": 4.256523508769647e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8827876895666122, + "num_tokens": 385494227.0, + "step": 3605 + }, + { + "epoch": 8.214367160775371, + "grad_norm": 3.28125, + "learning_rate": 4.246016898717381e-07, + "loss": 0.5853, + "mean_token_accuracy": 0.8795175403356552, + "num_tokens": 385600675.0, + "step": 3606 + }, + { + "epoch": 8.216647662485746, + "grad_norm": 3.203125, + "learning_rate": 4.235522068225248e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8818391412496567, + "num_tokens": 385707591.0, + "step": 3607 + }, + { + "epoch": 8.218928164196123, + "grad_norm": 3.015625, + "learning_rate": 4.225039023249916e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8863593190908432, + "num_tokens": 385814554.0, + "step": 3608 + }, + { + "epoch": 8.221208665906499, + "grad_norm": 3.0, + "learning_rate": 4.2145677697413566e-07, + "loss": 0.5972, + "mean_token_accuracy": 0.8759454041719437, + "num_tokens": 385921328.0, + "step": 3609 + }, + { + "epoch": 8.223489167616876, + "grad_norm": 4.3125, + "learning_rate": 4.204108313642852e-07, + "loss": 0.5589, + "mean_token_accuracy": 0.8848637193441391, + "num_tokens": 386028022.0, + "step": 3610 + }, + { + "epoch": 8.225769669327253, + "grad_norm": 3.890625, + "learning_rate": 4.1936606608909887e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.883062332868576, + "num_tokens": 386135191.0, + "step": 3611 + }, + { + "epoch": 8.228050171037628, + "grad_norm": 3.078125, + "learning_rate": 4.1832248174156597e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.885178342461586, + "num_tokens": 386242119.0, + "step": 3612 + }, + { + "epoch": 8.230330672748005, + "grad_norm": 3.234375, + "learning_rate": 4.1728007891400356e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8812027722597122, + "num_tokens": 386349161.0, + "step": 3613 + }, + { + "epoch": 8.23261117445838, + "grad_norm": 3.328125, + "learning_rate": 4.1623885819805977e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8800331354141235, + "num_tokens": 386456594.0, + "step": 3614 + }, + { + "epoch": 8.234891676168758, + "grad_norm": 3.515625, + "learning_rate": 4.151988201847112e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.884452149271965, + "num_tokens": 386563773.0, + "step": 3615 + }, + { + "epoch": 8.237172177879133, + "grad_norm": 3.359375, + "learning_rate": 4.141599654642642e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.8853688091039658, + "num_tokens": 386670901.0, + "step": 3616 + }, + { + "epoch": 8.23945267958951, + "grad_norm": 2.78125, + "learning_rate": 4.1312229462635243e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.8857151865959167, + "num_tokens": 386777914.0, + "step": 3617 + }, + { + "epoch": 8.241733181299885, + "grad_norm": 3.296875, + "learning_rate": 4.1208580825993686e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8840577751398087, + "num_tokens": 386884846.0, + "step": 3618 + }, + { + "epoch": 8.244013683010262, + "grad_norm": 3.15625, + "learning_rate": 4.1105050695330774e-07, + "loss": 0.5654, + "mean_token_accuracy": 0.8825150728225708, + "num_tokens": 386992013.0, + "step": 3619 + }, + { + "epoch": 8.246294184720638, + "grad_norm": 3.953125, + "learning_rate": 4.100163912940827e-07, + "loss": 0.5946, + "mean_token_accuracy": 0.8755682855844498, + "num_tokens": 387098665.0, + "step": 3620 + }, + { + "epoch": 8.248574686431015, + "grad_norm": 2.515625, + "learning_rate": 4.0898346186920484e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8821796476840973, + "num_tokens": 387205278.0, + "step": 3621 + }, + { + "epoch": 8.250855188141392, + "grad_norm": 5.0625, + "learning_rate": 4.0795171926494543e-07, + "loss": 0.5953, + "mean_token_accuracy": 0.8782723695039749, + "num_tokens": 387312537.0, + "step": 3622 + }, + { + "epoch": 8.253135689851767, + "grad_norm": 3.25, + "learning_rate": 4.0692116406690214e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8817024827003479, + "num_tokens": 387419037.0, + "step": 3623 + }, + { + "epoch": 8.255416191562144, + "grad_norm": 4.84375, + "learning_rate": 4.058917968599968e-07, + "loss": 0.5844, + "mean_token_accuracy": 0.8763107359409332, + "num_tokens": 387525971.0, + "step": 3624 + }, + { + "epoch": 8.25769669327252, + "grad_norm": 3.6875, + "learning_rate": 4.048636182284796e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8834527432918549, + "num_tokens": 387633084.0, + "step": 3625 + }, + { + "epoch": 8.259977194982897, + "grad_norm": 2.84375, + "learning_rate": 4.038366287559245e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8804467916488647, + "num_tokens": 387740264.0, + "step": 3626 + }, + { + "epoch": 8.262257696693272, + "grad_norm": 3.453125, + "learning_rate": 4.0281082902523055e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.8822118043899536, + "num_tokens": 387847552.0, + "step": 3627 + }, + { + "epoch": 8.264538198403649, + "grad_norm": 2.875, + "learning_rate": 4.0178621961862315e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8843962848186493, + "num_tokens": 387954393.0, + "step": 3628 + }, + { + "epoch": 8.266818700114024, + "grad_norm": 3.671875, + "learning_rate": 4.0076280111764927e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8815756440162659, + "num_tokens": 388061243.0, + "step": 3629 + }, + { + "epoch": 8.269099201824401, + "grad_norm": 3.328125, + "learning_rate": 3.997405741031821e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.885826587677002, + "num_tokens": 388168371.0, + "step": 3630 + }, + { + "epoch": 8.271379703534778, + "grad_norm": 2.890625, + "learning_rate": 3.98719539155418e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8817395716905594, + "num_tokens": 388276038.0, + "step": 3631 + }, + { + "epoch": 8.273660205245154, + "grad_norm": 2.890625, + "learning_rate": 3.9769969685387684e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8838264048099518, + "num_tokens": 388382766.0, + "step": 3632 + }, + { + "epoch": 8.27594070695553, + "grad_norm": 3.625, + "learning_rate": 3.966810477774016e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.8818796724081039, + "num_tokens": 388489939.0, + "step": 3633 + }, + { + "epoch": 8.278221208665906, + "grad_norm": 3.84375, + "learning_rate": 3.9566359250415686e-07, + "loss": 0.5851, + "mean_token_accuracy": 0.8771698474884033, + "num_tokens": 388596579.0, + "step": 3634 + }, + { + "epoch": 8.280501710376283, + "grad_norm": 3.078125, + "learning_rate": 3.9464733161163144e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.881603792309761, + "num_tokens": 388703184.0, + "step": 3635 + }, + { + "epoch": 8.282782212086659, + "grad_norm": 2.828125, + "learning_rate": 3.9363226567663503e-07, + "loss": 0.5835, + "mean_token_accuracy": 0.8799589574337006, + "num_tokens": 388809921.0, + "step": 3636 + }, + { + "epoch": 8.285062713797036, + "grad_norm": 3.453125, + "learning_rate": 3.926183952752999e-07, + "loss": 0.5805, + "mean_token_accuracy": 0.8831854462623596, + "num_tokens": 388916867.0, + "step": 3637 + }, + { + "epoch": 8.287343215507411, + "grad_norm": 3.03125, + "learning_rate": 3.9160572098307923e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8826831877231598, + "num_tokens": 389024041.0, + "step": 3638 + }, + { + "epoch": 8.289623717217788, + "grad_norm": 2.859375, + "learning_rate": 3.90594243374747e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8816553503274918, + "num_tokens": 389131535.0, + "step": 3639 + }, + { + "epoch": 8.291904218928163, + "grad_norm": 4.65625, + "learning_rate": 3.895839630243983e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8834093660116196, + "num_tokens": 389238175.0, + "step": 3640 + }, + { + "epoch": 8.29418472063854, + "grad_norm": 3.390625, + "learning_rate": 3.8857488050544903e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8817841410636902, + "num_tokens": 389344785.0, + "step": 3641 + }, + { + "epoch": 8.296465222348917, + "grad_norm": 2.625, + "learning_rate": 3.875669963906356e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8869169652462006, + "num_tokens": 389452290.0, + "step": 3642 + }, + { + "epoch": 8.298745724059293, + "grad_norm": 2.578125, + "learning_rate": 3.865603112520125e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8858313113451004, + "num_tokens": 389559422.0, + "step": 3643 + }, + { + "epoch": 8.30102622576967, + "grad_norm": 3.125, + "learning_rate": 3.855548256609556e-07, + "loss": 0.5437, + "mean_token_accuracy": 0.8861667513847351, + "num_tokens": 389667410.0, + "step": 3644 + }, + { + "epoch": 8.303306727480045, + "grad_norm": 3.421875, + "learning_rate": 3.8455054018815803e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8834993839263916, + "num_tokens": 389774294.0, + "step": 3645 + }, + { + "epoch": 8.305587229190422, + "grad_norm": 2.890625, + "learning_rate": 3.8354745540363364e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8835310786962509, + "num_tokens": 389881962.0, + "step": 3646 + }, + { + "epoch": 8.307867730900798, + "grad_norm": 3.390625, + "learning_rate": 3.8254557187671374e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.881848931312561, + "num_tokens": 389989039.0, + "step": 3647 + }, + { + "epoch": 8.310148232611175, + "grad_norm": 3.0, + "learning_rate": 3.815448901760485e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8844071328639984, + "num_tokens": 390096310.0, + "step": 3648 + }, + { + "epoch": 8.31242873432155, + "grad_norm": 3.4375, + "learning_rate": 3.805454108696055e-07, + "loss": 0.591, + "mean_token_accuracy": 0.8806750923395157, + "num_tokens": 390202884.0, + "step": 3649 + }, + { + "epoch": 8.314709236031927, + "grad_norm": 2.578125, + "learning_rate": 3.7954713452466927e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8855821341276169, + "num_tokens": 390310246.0, + "step": 3650 + }, + { + "epoch": 8.316989737742304, + "grad_norm": 3.578125, + "learning_rate": 3.785500617078425e-07, + "loss": 0.5854, + "mean_token_accuracy": 0.8808927685022354, + "num_tokens": 390416919.0, + "step": 3651 + }, + { + "epoch": 8.31927023945268, + "grad_norm": 3.34375, + "learning_rate": 3.775541929850443e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8799949437379837, + "num_tokens": 390524810.0, + "step": 3652 + }, + { + "epoch": 8.321550741163056, + "grad_norm": 2.390625, + "learning_rate": 3.76559528921511e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8805143088102341, + "num_tokens": 390631927.0, + "step": 3653 + }, + { + "epoch": 8.323831242873432, + "grad_norm": 2.796875, + "learning_rate": 3.7556607008179454e-07, + "loss": 0.557, + "mean_token_accuracy": 0.885184645652771, + "num_tokens": 390738890.0, + "step": 3654 + }, + { + "epoch": 8.326111744583809, + "grad_norm": 4.34375, + "learning_rate": 3.745738170297633e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8800319731235504, + "num_tokens": 390845227.0, + "step": 3655 + }, + { + "epoch": 8.328392246294184, + "grad_norm": 4.28125, + "learning_rate": 3.7358277032860016e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.8837781846523285, + "num_tokens": 390951990.0, + "step": 3656 + }, + { + "epoch": 8.330672748004561, + "grad_norm": 3.046875, + "learning_rate": 3.7259293054080435e-07, + "loss": 0.594, + "mean_token_accuracy": 0.8773994743824005, + "num_tokens": 391059266.0, + "step": 3657 + }, + { + "epoch": 8.332953249714937, + "grad_norm": 2.640625, + "learning_rate": 3.7160429822819003e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8825381994247437, + "num_tokens": 391166339.0, + "step": 3658 + }, + { + "epoch": 8.335233751425314, + "grad_norm": 3.21875, + "learning_rate": 3.706168739518859e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.881363645195961, + "num_tokens": 391273827.0, + "step": 3659 + }, + { + "epoch": 8.33751425313569, + "grad_norm": 4.375, + "learning_rate": 3.6963065827233524e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8808765411376953, + "num_tokens": 391381343.0, + "step": 3660 + }, + { + "epoch": 8.339794754846066, + "grad_norm": 4.28125, + "learning_rate": 3.6864565174929393e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8825783133506775, + "num_tokens": 391488599.0, + "step": 3661 + }, + { + "epoch": 8.342075256556443, + "grad_norm": 2.9375, + "learning_rate": 3.676618549418334e-07, + "loss": 0.5825, + "mean_token_accuracy": 0.8835724592208862, + "num_tokens": 391596245.0, + "step": 3662 + }, + { + "epoch": 8.344355758266818, + "grad_norm": 3.421875, + "learning_rate": 3.666792684083381e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.8827132731676102, + "num_tokens": 391702960.0, + "step": 3663 + }, + { + "epoch": 8.346636259977195, + "grad_norm": 2.96875, + "learning_rate": 3.656978927065041e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8864104300737381, + "num_tokens": 391811197.0, + "step": 3664 + }, + { + "epoch": 8.34891676168757, + "grad_norm": 3.03125, + "learning_rate": 3.64717728393342e-07, + "loss": 0.602, + "mean_token_accuracy": 0.8749131262302399, + "num_tokens": 391918416.0, + "step": 3665 + }, + { + "epoch": 8.351197263397948, + "grad_norm": 2.875, + "learning_rate": 3.6373877602517457e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.884322926402092, + "num_tokens": 392025371.0, + "step": 3666 + }, + { + "epoch": 8.353477765108323, + "grad_norm": 2.96875, + "learning_rate": 3.627610361576353e-07, + "loss": 0.5625, + "mean_token_accuracy": 0.8826847523450851, + "num_tokens": 392132210.0, + "step": 3667 + }, + { + "epoch": 8.3557582668187, + "grad_norm": 3.0, + "learning_rate": 3.6178450934567065e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.8771587163209915, + "num_tokens": 392239261.0, + "step": 3668 + }, + { + "epoch": 8.358038768529076, + "grad_norm": 3.15625, + "learning_rate": 3.6080919614353895e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8804507404565811, + "num_tokens": 392345489.0, + "step": 3669 + }, + { + "epoch": 8.360319270239453, + "grad_norm": 3.953125, + "learning_rate": 3.598350971048087e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8826466053724289, + "num_tokens": 392452753.0, + "step": 3670 + }, + { + "epoch": 8.36259977194983, + "grad_norm": 2.796875, + "learning_rate": 3.5886221278236045e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8848947435617447, + "num_tokens": 392559803.0, + "step": 3671 + }, + { + "epoch": 8.364880273660205, + "grad_norm": 3.859375, + "learning_rate": 3.578905437283833e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8808190226554871, + "num_tokens": 392666034.0, + "step": 3672 + }, + { + "epoch": 8.367160775370582, + "grad_norm": 4.21875, + "learning_rate": 3.569200904943784e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8789727836847305, + "num_tokens": 392772892.0, + "step": 3673 + }, + { + "epoch": 8.369441277080957, + "grad_norm": 3.765625, + "learning_rate": 3.559508536311568e-07, + "loss": 0.5528, + "mean_token_accuracy": 0.8866614103317261, + "num_tokens": 392880120.0, + "step": 3674 + }, + { + "epoch": 8.371721778791335, + "grad_norm": 2.765625, + "learning_rate": 3.549828336888378e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8827106803655624, + "num_tokens": 392987873.0, + "step": 3675 + }, + { + "epoch": 8.37400228050171, + "grad_norm": 2.71875, + "learning_rate": 3.5401603121685197e-07, + "loss": 0.5466, + "mean_token_accuracy": 0.8857832849025726, + "num_tokens": 393095077.0, + "step": 3676 + }, + { + "epoch": 8.376282782212087, + "grad_norm": 4.3125, + "learning_rate": 3.5305044676393645e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8792509883642197, + "num_tokens": 393202410.0, + "step": 3677 + }, + { + "epoch": 8.378563283922462, + "grad_norm": 4.28125, + "learning_rate": 3.5208608087813874e-07, + "loss": 0.5906, + "mean_token_accuracy": 0.8767274022102356, + "num_tokens": 393309342.0, + "step": 3678 + }, + { + "epoch": 8.38084378563284, + "grad_norm": 2.609375, + "learning_rate": 3.5112293410681455e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.881255492568016, + "num_tokens": 393416921.0, + "step": 3679 + }, + { + "epoch": 8.383124287343216, + "grad_norm": 3.859375, + "learning_rate": 3.501610069966271e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8843955397605896, + "num_tokens": 393524843.0, + "step": 3680 + }, + { + "epoch": 8.385404789053592, + "grad_norm": 2.84375, + "learning_rate": 3.492003000935487e-07, + "loss": 0.5547, + "mean_token_accuracy": 0.8885585069656372, + "num_tokens": 393632101.0, + "step": 3681 + }, + { + "epoch": 8.387685290763969, + "grad_norm": 3.671875, + "learning_rate": 3.482408139428564e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8814975172281265, + "num_tokens": 393739142.0, + "step": 3682 + }, + { + "epoch": 8.389965792474344, + "grad_norm": 3.0, + "learning_rate": 3.4728254908913683e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8834152221679688, + "num_tokens": 393846302.0, + "step": 3683 + }, + { + "epoch": 8.392246294184721, + "grad_norm": 2.90625, + "learning_rate": 3.463255060762827e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8830453753471375, + "num_tokens": 393952665.0, + "step": 3684 + }, + { + "epoch": 8.394526795895096, + "grad_norm": 2.6875, + "learning_rate": 3.4536968544749333e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8798201084136963, + "num_tokens": 394059602.0, + "step": 3685 + }, + { + "epoch": 8.396807297605474, + "grad_norm": 2.890625, + "learning_rate": 3.4441508774527345e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.8854030817747116, + "num_tokens": 394166384.0, + "step": 3686 + }, + { + "epoch": 8.399087799315849, + "grad_norm": 3.875, + "learning_rate": 3.434617135114349e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.8825076222419739, + "num_tokens": 394273389.0, + "step": 3687 + }, + { + "epoch": 8.401368301026226, + "grad_norm": 3.09375, + "learning_rate": 3.425095632870937e-07, + "loss": 0.594, + "mean_token_accuracy": 0.8796204030513763, + "num_tokens": 394380226.0, + "step": 3688 + }, + { + "epoch": 8.403648802736601, + "grad_norm": 3.734375, + "learning_rate": 3.4155863761267256e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8805309385061264, + "num_tokens": 394486821.0, + "step": 3689 + }, + { + "epoch": 8.405929304446978, + "grad_norm": 3.203125, + "learning_rate": 3.406089370278981e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8786486685276031, + "num_tokens": 394593768.0, + "step": 3690 + }, + { + "epoch": 8.408209806157355, + "grad_norm": 2.96875, + "learning_rate": 3.396604620718025e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8826608210802078, + "num_tokens": 394700911.0, + "step": 3691 + }, + { + "epoch": 8.41049030786773, + "grad_norm": 3.109375, + "learning_rate": 3.387132132827223e-07, + "loss": 0.5861, + "mean_token_accuracy": 0.8810716569423676, + "num_tokens": 394807801.0, + "step": 3692 + }, + { + "epoch": 8.412770809578108, + "grad_norm": 2.703125, + "learning_rate": 3.377671911982963e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.8786879032850266, + "num_tokens": 394914319.0, + "step": 3693 + }, + { + "epoch": 8.415051311288483, + "grad_norm": 3.046875, + "learning_rate": 3.3682239635546927e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8792416155338287, + "num_tokens": 395021338.0, + "step": 3694 + }, + { + "epoch": 8.41733181299886, + "grad_norm": 2.671875, + "learning_rate": 3.35878829290488e-07, + "loss": 0.5768, + "mean_token_accuracy": 0.8806908279657364, + "num_tokens": 395128126.0, + "step": 3695 + }, + { + "epoch": 8.419612314709235, + "grad_norm": 4.625, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.5661, + "mean_token_accuracy": 0.8870299756526947, + "num_tokens": 395234907.0, + "step": 3696 + }, + { + "epoch": 8.421892816419613, + "grad_norm": 2.953125, + "learning_rate": 3.339953806355692e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8846122622489929, + "num_tokens": 395342739.0, + "step": 3697 + }, + { + "epoch": 8.424173318129988, + "grad_norm": 5.5625, + "learning_rate": 3.330555001146399e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8779343664646149, + "num_tokens": 395449718.0, + "step": 3698 + }, + { + "epoch": 8.426453819840365, + "grad_norm": 2.796875, + "learning_rate": 3.3211684950957416e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8803275525569916, + "num_tokens": 395556795.0, + "step": 3699 + }, + { + "epoch": 8.428734321550742, + "grad_norm": 3.0, + "learning_rate": 3.311794293531323e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8883070945739746, + "num_tokens": 395664009.0, + "step": 3700 + }, + { + "epoch": 8.431014823261117, + "grad_norm": 4.125, + "learning_rate": 3.3024324017737555e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8797996789216995, + "num_tokens": 395771593.0, + "step": 3701 + }, + { + "epoch": 8.433295324971494, + "grad_norm": 3.0625, + "learning_rate": 3.2930828251366703e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8806504160165787, + "num_tokens": 395878663.0, + "step": 3702 + }, + { + "epoch": 8.43557582668187, + "grad_norm": 3.609375, + "learning_rate": 3.283745568926708e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8880557268857956, + "num_tokens": 395986031.0, + "step": 3703 + }, + { + "epoch": 8.437856328392247, + "grad_norm": 3.09375, + "learning_rate": 3.274420638443507e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8808362036943436, + "num_tokens": 396093478.0, + "step": 3704 + }, + { + "epoch": 8.440136830102622, + "grad_norm": 3.6875, + "learning_rate": 3.2651080389797253e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.8885146230459213, + "num_tokens": 396200577.0, + "step": 3705 + }, + { + "epoch": 8.442417331813, + "grad_norm": 2.640625, + "learning_rate": 3.255807775821015e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8813702762126923, + "num_tokens": 396307631.0, + "step": 3706 + }, + { + "epoch": 8.444697833523374, + "grad_norm": 4.8125, + "learning_rate": 3.246519854246022e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.882218137383461, + "num_tokens": 396415007.0, + "step": 3707 + }, + { + "epoch": 8.446978335233752, + "grad_norm": 4.34375, + "learning_rate": 3.2372442795263885e-07, + "loss": 0.5712, + "mean_token_accuracy": 0.8808966130018234, + "num_tokens": 396521935.0, + "step": 3708 + }, + { + "epoch": 8.449258836944129, + "grad_norm": 3.75, + "learning_rate": 3.227981056926763e-07, + "loss": 0.597, + "mean_token_accuracy": 0.8786474019289017, + "num_tokens": 396629351.0, + "step": 3709 + }, + { + "epoch": 8.451539338654504, + "grad_norm": 3.25, + "learning_rate": 3.218730191704758e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8776755332946777, + "num_tokens": 396736310.0, + "step": 3710 + }, + { + "epoch": 8.453819840364881, + "grad_norm": 3.5625, + "learning_rate": 3.209491689110994e-07, + "loss": 0.5609, + "mean_token_accuracy": 0.8844281584024429, + "num_tokens": 396843391.0, + "step": 3711 + }, + { + "epoch": 8.456100342075256, + "grad_norm": 3.21875, + "learning_rate": 3.2002655543890646e-07, + "loss": 0.5692, + "mean_token_accuracy": 0.8826111257076263, + "num_tokens": 396950647.0, + "step": 3712 + }, + { + "epoch": 8.458380843785633, + "grad_norm": 4.3125, + "learning_rate": 3.1910517927755516e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.879383772611618, + "num_tokens": 397058121.0, + "step": 3713 + }, + { + "epoch": 8.460661345496009, + "grad_norm": 2.640625, + "learning_rate": 3.181850409499995e-07, + "loss": 0.5669, + "mean_token_accuracy": 0.8790426999330521, + "num_tokens": 397165268.0, + "step": 3714 + }, + { + "epoch": 8.462941847206386, + "grad_norm": 2.453125, + "learning_rate": 3.1726614097849326e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8887834399938583, + "num_tokens": 397272308.0, + "step": 3715 + }, + { + "epoch": 8.465222348916761, + "grad_norm": 3.6875, + "learning_rate": 3.163484798845862e-07, + "loss": 0.5725, + "mean_token_accuracy": 0.8812167048454285, + "num_tokens": 397379017.0, + "step": 3716 + }, + { + "epoch": 8.467502850627138, + "grad_norm": 3.703125, + "learning_rate": 3.1543205818912484e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8864733725786209, + "num_tokens": 397485914.0, + "step": 3717 + }, + { + "epoch": 8.469783352337513, + "grad_norm": 2.8125, + "learning_rate": 3.145168764122525e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8842067122459412, + "num_tokens": 397592552.0, + "step": 3718 + }, + { + "epoch": 8.47206385404789, + "grad_norm": 4.65625, + "learning_rate": 3.1360293507340934e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8827557861804962, + "num_tokens": 397699338.0, + "step": 3719 + }, + { + "epoch": 8.474344355758268, + "grad_norm": 2.703125, + "learning_rate": 3.1269023469132937e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8828287869691849, + "num_tokens": 397806861.0, + "step": 3720 + }, + { + "epoch": 8.476624857468643, + "grad_norm": 3.9375, + "learning_rate": 3.117787757840449e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.8807165771722794, + "num_tokens": 397913908.0, + "step": 3721 + }, + { + "epoch": 8.47890535917902, + "grad_norm": 2.921875, + "learning_rate": 3.10868558868882e-07, + "loss": 0.5543, + "mean_token_accuracy": 0.8827250152826309, + "num_tokens": 398021549.0, + "step": 3722 + }, + { + "epoch": 8.481185860889395, + "grad_norm": 3.046875, + "learning_rate": 3.0995958446246197e-07, + "loss": 0.5783, + "mean_token_accuracy": 0.881324291229248, + "num_tokens": 398128775.0, + "step": 3723 + }, + { + "epoch": 8.483466362599772, + "grad_norm": 3.21875, + "learning_rate": 3.090518530807021e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8850361853837967, + "num_tokens": 398235464.0, + "step": 3724 + }, + { + "epoch": 8.485746864310148, + "grad_norm": 2.59375, + "learning_rate": 3.0814536523881224e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8856307417154312, + "num_tokens": 398342478.0, + "step": 3725 + }, + { + "epoch": 8.488027366020525, + "grad_norm": 3.703125, + "learning_rate": 3.072401214512974e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8800984472036362, + "num_tokens": 398449103.0, + "step": 3726 + }, + { + "epoch": 8.4903078677309, + "grad_norm": 3.0625, + "learning_rate": 3.063361222319569e-07, + "loss": 0.5692, + "mean_token_accuracy": 0.8843390345573425, + "num_tokens": 398556171.0, + "step": 3727 + }, + { + "epoch": 8.492588369441277, + "grad_norm": 3.71875, + "learning_rate": 3.054333680938837e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8837459683418274, + "num_tokens": 398663128.0, + "step": 3728 + }, + { + "epoch": 8.494868871151652, + "grad_norm": 4.25, + "learning_rate": 3.045318595494623e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8831415474414825, + "num_tokens": 398770220.0, + "step": 3729 + }, + { + "epoch": 8.49714937286203, + "grad_norm": 2.796875, + "learning_rate": 3.036315971103723e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8805525749921799, + "num_tokens": 398876750.0, + "step": 3730 + }, + { + "epoch": 8.499429874572407, + "grad_norm": 2.796875, + "learning_rate": 3.0273258128758585e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8824233710765839, + "num_tokens": 398983846.0, + "step": 3731 + }, + { + "epoch": 8.501710376282782, + "grad_norm": 2.671875, + "learning_rate": 3.018348125913659e-07, + "loss": 0.5686, + "mean_token_accuracy": 0.8839138001203537, + "num_tokens": 399090705.0, + "step": 3732 + }, + { + "epoch": 8.503990877993159, + "grad_norm": 3.015625, + "learning_rate": 3.009382915312689e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8818164467811584, + "num_tokens": 399197937.0, + "step": 3733 + }, + { + "epoch": 8.506271379703534, + "grad_norm": 4.03125, + "learning_rate": 3.000430186161432e-07, + "loss": 0.5653, + "mean_token_accuracy": 0.8860601335763931, + "num_tokens": 399304810.0, + "step": 3734 + }, + { + "epoch": 8.508551881413911, + "grad_norm": 3.265625, + "learning_rate": 2.991489943541287e-07, + "loss": 0.5684, + "mean_token_accuracy": 0.8790812194347382, + "num_tokens": 399411951.0, + "step": 3735 + }, + { + "epoch": 8.510832383124287, + "grad_norm": 2.96875, + "learning_rate": 2.982562192526556e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.881022498011589, + "num_tokens": 399519175.0, + "step": 3736 + }, + { + "epoch": 8.513112884834664, + "grad_norm": 3.640625, + "learning_rate": 2.97364693818446e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8782191127538681, + "num_tokens": 399626295.0, + "step": 3737 + }, + { + "epoch": 8.515393386545039, + "grad_norm": 2.90625, + "learning_rate": 2.9647441855751274e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8843040764331818, + "num_tokens": 399733595.0, + "step": 3738 + }, + { + "epoch": 8.517673888255416, + "grad_norm": 4.59375, + "learning_rate": 2.9558539397515905e-07, + "loss": 0.5624, + "mean_token_accuracy": 0.884150817990303, + "num_tokens": 399840730.0, + "step": 3739 + }, + { + "epoch": 8.519954389965793, + "grad_norm": 3.25, + "learning_rate": 2.94697620575978e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8824001997709274, + "num_tokens": 399947355.0, + "step": 3740 + }, + { + "epoch": 8.519954389965793, + "eval_loss": 0.586450457572937, + "eval_mean_token_accuracy": 0.8799830790708274, + "eval_num_tokens": 399947355.0, + "eval_runtime": 58.61, + "eval_samples_per_second": 143.064, + "eval_steps_per_second": 4.487, + "step": 3740 + }, + { + "epoch": 8.522234891676169, + "grad_norm": 3.0, + "learning_rate": 2.938110988638521e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.8857548832893372, + "num_tokens": 400053792.0, + "step": 3741 + }, + { + "epoch": 8.524515393386546, + "grad_norm": 2.625, + "learning_rate": 2.9292582934195427e-07, + "loss": 0.5805, + "mean_token_accuracy": 0.8823639154434204, + "num_tokens": 400160716.0, + "step": 3742 + }, + { + "epoch": 8.526795895096921, + "grad_norm": 4.75, + "learning_rate": 2.9204181251274665e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8819014728069305, + "num_tokens": 400267286.0, + "step": 3743 + }, + { + "epoch": 8.529076396807298, + "grad_norm": 3.234375, + "learning_rate": 2.9115904887798005e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8814072459936142, + "num_tokens": 400374052.0, + "step": 3744 + }, + { + "epoch": 8.531356898517673, + "grad_norm": 2.84375, + "learning_rate": 2.9027753893869387e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8873879760503769, + "num_tokens": 400481323.0, + "step": 3745 + }, + { + "epoch": 8.53363740022805, + "grad_norm": 3.890625, + "learning_rate": 2.893972831952166e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8847419172525406, + "num_tokens": 400588604.0, + "step": 3746 + }, + { + "epoch": 8.535917901938426, + "grad_norm": 2.765625, + "learning_rate": 2.8851828214716383e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.8783596158027649, + "num_tokens": 400695471.0, + "step": 3747 + }, + { + "epoch": 8.538198403648803, + "grad_norm": 2.859375, + "learning_rate": 2.876405362934395e-07, + "loss": 0.5835, + "mean_token_accuracy": 0.8776167929172516, + "num_tokens": 400802607.0, + "step": 3748 + }, + { + "epoch": 8.54047890535918, + "grad_norm": 2.96875, + "learning_rate": 2.8676404613223573e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.8812108039855957, + "num_tokens": 400909340.0, + "step": 3749 + }, + { + "epoch": 8.542759407069555, + "grad_norm": 4.0, + "learning_rate": 2.858888121610315e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.887622594833374, + "num_tokens": 401015859.0, + "step": 3750 + }, + { + "epoch": 8.545039908779932, + "grad_norm": 3.4375, + "learning_rate": 2.8501483487659217e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.8768871873617172, + "num_tokens": 401122411.0, + "step": 3751 + }, + { + "epoch": 8.547320410490308, + "grad_norm": 2.578125, + "learning_rate": 2.841421147749709e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.8786147236824036, + "num_tokens": 401229660.0, + "step": 3752 + }, + { + "epoch": 8.549600912200685, + "grad_norm": 2.65625, + "learning_rate": 2.832706523515061e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.8837973475456238, + "num_tokens": 401336325.0, + "step": 3753 + }, + { + "epoch": 8.55188141391106, + "grad_norm": 2.890625, + "learning_rate": 2.824004481008233e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8784725517034531, + "num_tokens": 401444403.0, + "step": 3754 + }, + { + "epoch": 8.554161915621437, + "grad_norm": 3.15625, + "learning_rate": 2.815315025168339e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.880395233631134, + "num_tokens": 401551536.0, + "step": 3755 + }, + { + "epoch": 8.556442417331812, + "grad_norm": 2.578125, + "learning_rate": 2.8066381609273497e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8854324221611023, + "num_tokens": 401658365.0, + "step": 3756 + }, + { + "epoch": 8.55872291904219, + "grad_norm": 3.09375, + "learning_rate": 2.7979738932100734e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8865665048360825, + "num_tokens": 401765495.0, + "step": 3757 + }, + { + "epoch": 8.561003420752566, + "grad_norm": 3.578125, + "learning_rate": 2.7893222269341906e-07, + "loss": 0.5725, + "mean_token_accuracy": 0.8840383142232895, + "num_tokens": 401872546.0, + "step": 3758 + }, + { + "epoch": 8.563283922462942, + "grad_norm": 3.171875, + "learning_rate": 2.7806831670102176e-07, + "loss": 0.5856, + "mean_token_accuracy": 0.8813675791025162, + "num_tokens": 401979397.0, + "step": 3759 + }, + { + "epoch": 8.565564424173319, + "grad_norm": 2.875, + "learning_rate": 2.7720567183415175e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8806767165660858, + "num_tokens": 402086207.0, + "step": 3760 + }, + { + "epoch": 8.567844925883694, + "grad_norm": 3.015625, + "learning_rate": 2.7634428858242995e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.8802158385515213, + "num_tokens": 402192757.0, + "step": 3761 + }, + { + "epoch": 8.570125427594071, + "grad_norm": 2.796875, + "learning_rate": 2.754841674347608e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8806091845035553, + "num_tokens": 402299546.0, + "step": 3762 + }, + { + "epoch": 8.572405929304447, + "grad_norm": 3.46875, + "learning_rate": 2.7462530887933216e-07, + "loss": 0.559, + "mean_token_accuracy": 0.8859734088182449, + "num_tokens": 402406468.0, + "step": 3763 + }, + { + "epoch": 8.574686431014824, + "grad_norm": 3.375, + "learning_rate": 2.737677134036154e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8834273219108582, + "num_tokens": 402513385.0, + "step": 3764 + }, + { + "epoch": 8.576966932725199, + "grad_norm": 3.015625, + "learning_rate": 2.729113814943654e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8773778825998306, + "num_tokens": 402620374.0, + "step": 3765 + }, + { + "epoch": 8.579247434435576, + "grad_norm": 3.25, + "learning_rate": 2.7205631363761976e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8817746788263321, + "num_tokens": 402727727.0, + "step": 3766 + }, + { + "epoch": 8.581527936145951, + "grad_norm": 2.90625, + "learning_rate": 2.7120251031869884e-07, + "loss": 0.5487, + "mean_token_accuracy": 0.886865645647049, + "num_tokens": 402834747.0, + "step": 3767 + }, + { + "epoch": 8.583808437856328, + "grad_norm": 3.0, + "learning_rate": 2.7034997202220384e-07, + "loss": 0.5794, + "mean_token_accuracy": 0.8793386965990067, + "num_tokens": 402941219.0, + "step": 3768 + }, + { + "epoch": 8.586088939566705, + "grad_norm": 2.953125, + "learning_rate": 2.6949869923202e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8822144120931625, + "num_tokens": 403048515.0, + "step": 3769 + }, + { + "epoch": 8.58836944127708, + "grad_norm": 3.0, + "learning_rate": 2.686486924313128e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8802978694438934, + "num_tokens": 403156281.0, + "step": 3770 + }, + { + "epoch": 8.590649942987458, + "grad_norm": 2.8125, + "learning_rate": 2.6779995210253015e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.8790141940116882, + "num_tokens": 403263439.0, + "step": 3771 + }, + { + "epoch": 8.592930444697833, + "grad_norm": 3.21875, + "learning_rate": 2.6695247872740027e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.8814330548048019, + "num_tokens": 403370460.0, + "step": 3772 + }, + { + "epoch": 8.59521094640821, + "grad_norm": 2.53125, + "learning_rate": 2.6610627278693265e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8820819407701492, + "num_tokens": 403477203.0, + "step": 3773 + }, + { + "epoch": 8.597491448118586, + "grad_norm": 2.65625, + "learning_rate": 2.6526133476141804e-07, + "loss": 0.5748, + "mean_token_accuracy": 0.8831133842468262, + "num_tokens": 403583814.0, + "step": 3774 + }, + { + "epoch": 8.599771949828963, + "grad_norm": 3.765625, + "learning_rate": 2.644176651304259e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.8832801431417465, + "num_tokens": 403690937.0, + "step": 3775 + }, + { + "epoch": 8.602052451539338, + "grad_norm": 2.671875, + "learning_rate": 2.6357526437280764e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8839538246393204, + "num_tokens": 403798148.0, + "step": 3776 + }, + { + "epoch": 8.604332953249715, + "grad_norm": 2.828125, + "learning_rate": 2.6273413296669353e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.8829332441091537, + "num_tokens": 403904875.0, + "step": 3777 + }, + { + "epoch": 8.60661345496009, + "grad_norm": 2.75, + "learning_rate": 2.618942713894937e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.8815491497516632, + "num_tokens": 404011913.0, + "step": 3778 + }, + { + "epoch": 8.608893956670467, + "grad_norm": 2.703125, + "learning_rate": 2.610556801178968e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8805450052022934, + "num_tokens": 404119324.0, + "step": 3779 + }, + { + "epoch": 8.611174458380844, + "grad_norm": 2.578125, + "learning_rate": 2.602183596278715e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.889171689748764, + "num_tokens": 404226267.0, + "step": 3780 + }, + { + "epoch": 8.61345496009122, + "grad_norm": 2.8125, + "learning_rate": 2.5938231039466436e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8866444677114487, + "num_tokens": 404333323.0, + "step": 3781 + }, + { + "epoch": 8.615735461801597, + "grad_norm": 2.734375, + "learning_rate": 2.585475328928011e-07, + "loss": 0.5725, + "mean_token_accuracy": 0.8843964338302612, + "num_tokens": 404440123.0, + "step": 3782 + }, + { + "epoch": 8.618015963511972, + "grad_norm": 3.71875, + "learning_rate": 2.577140275960857e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.882148802280426, + "num_tokens": 404546230.0, + "step": 3783 + }, + { + "epoch": 8.62029646522235, + "grad_norm": 2.671875, + "learning_rate": 2.5688179497759895e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.8827191293239594, + "num_tokens": 404653034.0, + "step": 3784 + }, + { + "epoch": 8.622576966932725, + "grad_norm": 3.140625, + "learning_rate": 2.560508355097002e-07, + "loss": 0.5588, + "mean_token_accuracy": 0.8882575631141663, + "num_tokens": 404760581.0, + "step": 3785 + }, + { + "epoch": 8.624857468643102, + "grad_norm": 3.21875, + "learning_rate": 2.552211496640261e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8854646235704422, + "num_tokens": 404868271.0, + "step": 3786 + }, + { + "epoch": 8.627137970353477, + "grad_norm": 2.375, + "learning_rate": 2.543927379114902e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.8811837881803513, + "num_tokens": 404974915.0, + "step": 3787 + }, + { + "epoch": 8.629418472063854, + "grad_norm": 2.875, + "learning_rate": 2.5356560072228335e-07, + "loss": 0.5688, + "mean_token_accuracy": 0.8805698156356812, + "num_tokens": 405081754.0, + "step": 3788 + }, + { + "epoch": 8.631698973774231, + "grad_norm": 2.921875, + "learning_rate": 2.5273973856587283e-07, + "loss": 0.579, + "mean_token_accuracy": 0.879248857498169, + "num_tokens": 405188743.0, + "step": 3789 + }, + { + "epoch": 8.633979475484606, + "grad_norm": 3.046875, + "learning_rate": 2.5191515191100107e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8826890587806702, + "num_tokens": 405296046.0, + "step": 3790 + }, + { + "epoch": 8.636259977194984, + "grad_norm": 3.71875, + "learning_rate": 2.5109184122568797e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.88176129758358, + "num_tokens": 405402833.0, + "step": 3791 + }, + { + "epoch": 8.638540478905359, + "grad_norm": 2.796875, + "learning_rate": 2.502698069772294e-07, + "loss": 0.5776, + "mean_token_accuracy": 0.8807597756385803, + "num_tokens": 405509745.0, + "step": 3792 + }, + { + "epoch": 8.640820980615736, + "grad_norm": 2.953125, + "learning_rate": 2.494490496321958e-07, + "loss": 0.5805, + "mean_token_accuracy": 0.8785258382558823, + "num_tokens": 405616827.0, + "step": 3793 + }, + { + "epoch": 8.643101482326111, + "grad_norm": 2.375, + "learning_rate": 2.4862956965643253e-07, + "loss": 0.582, + "mean_token_accuracy": 0.8794488459825516, + "num_tokens": 405723490.0, + "step": 3794 + }, + { + "epoch": 8.645381984036488, + "grad_norm": 3.0625, + "learning_rate": 2.4781136751506176e-07, + "loss": 0.5634, + "mean_token_accuracy": 0.8818620294332504, + "num_tokens": 405830225.0, + "step": 3795 + }, + { + "epoch": 8.647662485746864, + "grad_norm": 3.171875, + "learning_rate": 2.4699444367247834e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.886525347828865, + "num_tokens": 405937278.0, + "step": 3796 + }, + { + "epoch": 8.64994298745724, + "grad_norm": 2.625, + "learning_rate": 2.461787985923525e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8777887523174286, + "num_tokens": 406044118.0, + "step": 3797 + }, + { + "epoch": 8.652223489167618, + "grad_norm": 3.21875, + "learning_rate": 2.4536443273762864e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8797616511583328, + "num_tokens": 406151330.0, + "step": 3798 + }, + { + "epoch": 8.654503990877993, + "grad_norm": 5.15625, + "learning_rate": 2.4455134657052626e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8845881819725037, + "num_tokens": 406258516.0, + "step": 3799 + }, + { + "epoch": 8.65678449258837, + "grad_norm": 2.9375, + "learning_rate": 2.437395405525356e-07, + "loss": 0.5844, + "mean_token_accuracy": 0.8821278810501099, + "num_tokens": 406364734.0, + "step": 3800 + }, + { + "epoch": 8.659064994298745, + "grad_norm": 4.46875, + "learning_rate": 2.429290151444233e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.8842455595731735, + "num_tokens": 406471849.0, + "step": 3801 + }, + { + "epoch": 8.661345496009123, + "grad_norm": 3.359375, + "learning_rate": 2.421197708062273e-07, + "loss": 0.5721, + "mean_token_accuracy": 0.8807191848754883, + "num_tokens": 406579175.0, + "step": 3802 + }, + { + "epoch": 8.663625997719498, + "grad_norm": 4.59375, + "learning_rate": 2.413118079972593e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8790168464183807, + "num_tokens": 406686082.0, + "step": 3803 + }, + { + "epoch": 8.665906499429875, + "grad_norm": 4.0, + "learning_rate": 2.405051271761036e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8817109614610672, + "num_tokens": 406793281.0, + "step": 3804 + }, + { + "epoch": 8.66818700114025, + "grad_norm": 3.234375, + "learning_rate": 2.396997288006167e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8831426650285721, + "num_tokens": 406900397.0, + "step": 3805 + }, + { + "epoch": 8.670467502850627, + "grad_norm": 2.984375, + "learning_rate": 2.388956133279266e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.885161817073822, + "num_tokens": 407007825.0, + "step": 3806 + }, + { + "epoch": 8.672748004561003, + "grad_norm": 2.875, + "learning_rate": 2.3809278121443403e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8806300610303879, + "num_tokens": 407115154.0, + "step": 3807 + }, + { + "epoch": 8.67502850627138, + "grad_norm": 4.625, + "learning_rate": 2.3729123291581112e-07, + "loss": 0.5783, + "mean_token_accuracy": 0.8829029500484467, + "num_tokens": 407222499.0, + "step": 3808 + }, + { + "epoch": 8.677309007981757, + "grad_norm": 2.96875, + "learning_rate": 2.3649096888700095e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8817735910415649, + "num_tokens": 407329789.0, + "step": 3809 + }, + { + "epoch": 8.679589509692132, + "grad_norm": 2.421875, + "learning_rate": 2.356919895822188e-07, + "loss": 0.5606, + "mean_token_accuracy": 0.885416716337204, + "num_tokens": 407436906.0, + "step": 3810 + }, + { + "epoch": 8.68187001140251, + "grad_norm": 2.859375, + "learning_rate": 2.3489429545494851e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8833657205104828, + "num_tokens": 407544131.0, + "step": 3811 + }, + { + "epoch": 8.684150513112884, + "grad_norm": 2.625, + "learning_rate": 2.3409788695794688e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8843205720186234, + "num_tokens": 407652008.0, + "step": 3812 + }, + { + "epoch": 8.686431014823262, + "grad_norm": 2.9375, + "learning_rate": 2.3330276454323926e-07, + "loss": 0.5669, + "mean_token_accuracy": 0.8810292482376099, + "num_tokens": 407759542.0, + "step": 3813 + }, + { + "epoch": 8.688711516533637, + "grad_norm": 3.03125, + "learning_rate": 2.3250892866212294e-07, + "loss": 0.5823, + "mean_token_accuracy": 0.8779249489307404, + "num_tokens": 407865934.0, + "step": 3814 + }, + { + "epoch": 8.690992018244014, + "grad_norm": 3.046875, + "learning_rate": 2.3171637976516253e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8835948258638382, + "num_tokens": 407973474.0, + "step": 3815 + }, + { + "epoch": 8.69327251995439, + "grad_norm": 2.78125, + "learning_rate": 2.3092511830219405e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.8819569945335388, + "num_tokens": 408080757.0, + "step": 3816 + }, + { + "epoch": 8.695553021664766, + "grad_norm": 2.671875, + "learning_rate": 2.3013514472232295e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8784191906452179, + "num_tokens": 408188778.0, + "step": 3817 + }, + { + "epoch": 8.697833523375142, + "grad_norm": 2.421875, + "learning_rate": 2.293464594739214e-07, + "loss": 0.5593, + "mean_token_accuracy": 0.8858485221862793, + "num_tokens": 408295942.0, + "step": 3818 + }, + { + "epoch": 8.700114025085519, + "grad_norm": 2.921875, + "learning_rate": 2.2855906300463305e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8777198940515518, + "num_tokens": 408402995.0, + "step": 3819 + }, + { + "epoch": 8.702394526795896, + "grad_norm": 3.109375, + "learning_rate": 2.2777295576136865e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.888022854924202, + "num_tokens": 408510620.0, + "step": 3820 + }, + { + "epoch": 8.704675028506271, + "grad_norm": 3.203125, + "learning_rate": 2.2698813819030802e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8824697285890579, + "num_tokens": 408617724.0, + "step": 3821 + }, + { + "epoch": 8.706955530216648, + "grad_norm": 3.484375, + "learning_rate": 2.2620461073689732e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8824862539768219, + "num_tokens": 408724694.0, + "step": 3822 + }, + { + "epoch": 8.709236031927023, + "grad_norm": 3.640625, + "learning_rate": 2.254223738458522e-07, + "loss": 0.5593, + "mean_token_accuracy": 0.8855371475219727, + "num_tokens": 408832529.0, + "step": 3823 + }, + { + "epoch": 8.7115165336374, + "grad_norm": 2.703125, + "learning_rate": 2.2464142796115557e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8828513324260712, + "num_tokens": 408939774.0, + "step": 3824 + }, + { + "epoch": 8.713797035347776, + "grad_norm": 3.125, + "learning_rate": 2.2386177352605677e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.8790740817785263, + "num_tokens": 409046308.0, + "step": 3825 + }, + { + "epoch": 8.716077537058153, + "grad_norm": 2.546875, + "learning_rate": 2.2308341098307318e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8837715536355972, + "num_tokens": 409153352.0, + "step": 3826 + }, + { + "epoch": 8.718358038768528, + "grad_norm": 3.0, + "learning_rate": 2.2230634077398755e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8824053555727005, + "num_tokens": 409260531.0, + "step": 3827 + }, + { + "epoch": 8.720638540478905, + "grad_norm": 3.0, + "learning_rate": 2.2153056333985014e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8850871324539185, + "num_tokens": 409367568.0, + "step": 3828 + }, + { + "epoch": 8.722919042189282, + "grad_norm": 3.96875, + "learning_rate": 2.2075607912097758e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8864067494869232, + "num_tokens": 409474914.0, + "step": 3829 + }, + { + "epoch": 8.725199543899658, + "grad_norm": 5.375, + "learning_rate": 2.1998288855695189e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8827249854803085, + "num_tokens": 409581981.0, + "step": 3830 + }, + { + "epoch": 8.727480045610035, + "grad_norm": 3.46875, + "learning_rate": 2.1921099208662173e-07, + "loss": 0.5828, + "mean_token_accuracy": 0.8793773353099823, + "num_tokens": 409689221.0, + "step": 3831 + }, + { + "epoch": 8.72976054732041, + "grad_norm": 4.1875, + "learning_rate": 2.184403901480997e-07, + "loss": 0.5921, + "mean_token_accuracy": 0.8778216391801834, + "num_tokens": 409796146.0, + "step": 3832 + }, + { + "epoch": 8.732041049030787, + "grad_norm": 3.28125, + "learning_rate": 2.176710831787651e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8842185884714127, + "num_tokens": 409902678.0, + "step": 3833 + }, + { + "epoch": 8.734321550741162, + "grad_norm": 2.921875, + "learning_rate": 2.1690307161526148e-07, + "loss": 0.5482, + "mean_token_accuracy": 0.8871366381645203, + "num_tokens": 410010272.0, + "step": 3834 + }, + { + "epoch": 8.73660205245154, + "grad_norm": 3.890625, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.5492, + "mean_token_accuracy": 0.885500431060791, + "num_tokens": 410117832.0, + "step": 3835 + }, + { + "epoch": 8.738882554161915, + "grad_norm": 2.84375, + "learning_rate": 2.153709364486467e-07, + "loss": 0.5601, + "mean_token_accuracy": 0.8852385729551315, + "num_tokens": 410225406.0, + "step": 3836 + }, + { + "epoch": 8.741163055872292, + "grad_norm": 2.640625, + "learning_rate": 2.1460681371514552e-07, + "loss": 0.5574, + "mean_token_accuracy": 0.8837675005197525, + "num_tokens": 410332633.0, + "step": 3837 + }, + { + "epoch": 8.743443557582669, + "grad_norm": 2.703125, + "learning_rate": 2.13843988126696e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8787273913621902, + "num_tokens": 410439603.0, + "step": 3838 + }, + { + "epoch": 8.745724059293044, + "grad_norm": 3.421875, + "learning_rate": 2.130824601162626e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.8858155012130737, + "num_tokens": 410546706.0, + "step": 3839 + }, + { + "epoch": 8.748004561003421, + "grad_norm": 3.5625, + "learning_rate": 2.1232223011607406e-07, + "loss": 0.5916, + "mean_token_accuracy": 0.8737548291683197, + "num_tokens": 410653837.0, + "step": 3840 + }, + { + "epoch": 8.750285062713797, + "grad_norm": 2.828125, + "learning_rate": 2.1156329855762243e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8844601809978485, + "num_tokens": 410761316.0, + "step": 3841 + }, + { + "epoch": 8.752565564424174, + "grad_norm": 3.234375, + "learning_rate": 2.1080566587166286e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8839665204286575, + "num_tokens": 410868322.0, + "step": 3842 + }, + { + "epoch": 8.754846066134549, + "grad_norm": 3.25, + "learning_rate": 2.1004933248821247e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8804609328508377, + "num_tokens": 410976264.0, + "step": 3843 + }, + { + "epoch": 8.757126567844926, + "grad_norm": 3.578125, + "learning_rate": 2.0929429883655151e-07, + "loss": 0.5793, + "mean_token_accuracy": 0.8776623606681824, + "num_tokens": 411083822.0, + "step": 3844 + }, + { + "epoch": 8.759407069555301, + "grad_norm": 3.484375, + "learning_rate": 2.08540565345223e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8814112395048141, + "num_tokens": 411191400.0, + "step": 3845 + }, + { + "epoch": 8.761687571265679, + "grad_norm": 3.921875, + "learning_rate": 2.0778813244203111e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8791114687919617, + "num_tokens": 411298004.0, + "step": 3846 + }, + { + "epoch": 8.763968072976056, + "grad_norm": 2.828125, + "learning_rate": 2.0703700055404285e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.879044771194458, + "num_tokens": 411404179.0, + "step": 3847 + }, + { + "epoch": 8.766248574686431, + "grad_norm": 3.015625, + "learning_rate": 2.0628717010758526e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8879903703927994, + "num_tokens": 411511695.0, + "step": 3848 + }, + { + "epoch": 8.768529076396808, + "grad_norm": 2.8125, + "learning_rate": 2.0553864152824815e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.8788136094808578, + "num_tokens": 411619621.0, + "step": 3849 + }, + { + "epoch": 8.770809578107183, + "grad_norm": 2.71875, + "learning_rate": 2.0479141524088169e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.883419394493103, + "num_tokens": 411726749.0, + "step": 3850 + }, + { + "epoch": 8.77309007981756, + "grad_norm": 3.0, + "learning_rate": 2.040454916695972e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.8799424916505814, + "num_tokens": 411833865.0, + "step": 3851 + }, + { + "epoch": 8.775370581527936, + "grad_norm": 2.46875, + "learning_rate": 2.0330087123776655e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.884945884346962, + "num_tokens": 411941568.0, + "step": 3852 + }, + { + "epoch": 8.777651083238313, + "grad_norm": 2.8125, + "learning_rate": 2.0255755436802248e-07, + "loss": 0.5659, + "mean_token_accuracy": 0.884887769818306, + "num_tokens": 412048897.0, + "step": 3853 + }, + { + "epoch": 8.779931584948688, + "grad_norm": 3.640625, + "learning_rate": 2.0181554148225618e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.8816755414009094, + "num_tokens": 412155508.0, + "step": 3854 + }, + { + "epoch": 8.782212086659065, + "grad_norm": 3.34375, + "learning_rate": 2.0107483300162018e-07, + "loss": 0.5635, + "mean_token_accuracy": 0.885670393705368, + "num_tokens": 412262951.0, + "step": 3855 + }, + { + "epoch": 8.78449258836944, + "grad_norm": 2.546875, + "learning_rate": 2.0033542934652679e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8826614916324615, + "num_tokens": 412370424.0, + "step": 3856 + }, + { + "epoch": 8.786773090079818, + "grad_norm": 2.734375, + "learning_rate": 1.9959733093664696e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8853038400411606, + "num_tokens": 412478059.0, + "step": 3857 + }, + { + "epoch": 8.789053591790195, + "grad_norm": 2.875, + "learning_rate": 1.9886053819091116e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.8780199140310287, + "num_tokens": 412584823.0, + "step": 3858 + }, + { + "epoch": 8.79133409350057, + "grad_norm": 3.671875, + "learning_rate": 1.981250515275085e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8827866017818451, + "num_tokens": 412692334.0, + "step": 3859 + }, + { + "epoch": 8.793614595210947, + "grad_norm": 3.203125, + "learning_rate": 1.973908713638878e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.8867141306400299, + "num_tokens": 412799411.0, + "step": 3860 + }, + { + "epoch": 8.795895096921322, + "grad_norm": 2.65625, + "learning_rate": 1.9665799811675407e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.8858144879341125, + "num_tokens": 412906629.0, + "step": 3861 + }, + { + "epoch": 8.7981755986317, + "grad_norm": 2.734375, + "learning_rate": 1.959264322020732e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8803358823060989, + "num_tokens": 413013873.0, + "step": 3862 + }, + { + "epoch": 8.800456100342075, + "grad_norm": 4.15625, + "learning_rate": 1.9519617403506747e-07, + "loss": 0.5746, + "mean_token_accuracy": 0.8833809643983841, + "num_tokens": 413120858.0, + "step": 3863 + }, + { + "epoch": 8.802736602052452, + "grad_norm": 3.0, + "learning_rate": 1.9446722403021757e-07, + "loss": 0.5868, + "mean_token_accuracy": 0.8800954222679138, + "num_tokens": 413227696.0, + "step": 3864 + }, + { + "epoch": 8.805017103762827, + "grad_norm": 2.90625, + "learning_rate": 1.9373958260126113e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8825733810663223, + "num_tokens": 413334739.0, + "step": 3865 + }, + { + "epoch": 8.807297605473204, + "grad_norm": 3.609375, + "learning_rate": 1.9301325016119338e-07, + "loss": 0.5645, + "mean_token_accuracy": 0.8862461596727371, + "num_tokens": 413441973.0, + "step": 3866 + }, + { + "epoch": 8.80957810718358, + "grad_norm": 3.03125, + "learning_rate": 1.9228822712226675e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.880901426076889, + "num_tokens": 413549047.0, + "step": 3867 + }, + { + "epoch": 8.811858608893957, + "grad_norm": 3.1875, + "learning_rate": 1.915645138959904e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8816043883562088, + "num_tokens": 413656651.0, + "step": 3868 + }, + { + "epoch": 8.814139110604334, + "grad_norm": 3.671875, + "learning_rate": 1.908421108931302e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8817258775234222, + "num_tokens": 413763796.0, + "step": 3869 + }, + { + "epoch": 8.816419612314709, + "grad_norm": 3.484375, + "learning_rate": 1.9012101852370763e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8846326470375061, + "num_tokens": 413871019.0, + "step": 3870 + }, + { + "epoch": 8.818700114025086, + "grad_norm": 3.453125, + "learning_rate": 1.894012371970008e-07, + "loss": 0.556, + "mean_token_accuracy": 0.8854041546583176, + "num_tokens": 413978295.0, + "step": 3871 + }, + { + "epoch": 8.820980615735461, + "grad_norm": 3.21875, + "learning_rate": 1.8868276732154384e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8823095858097076, + "num_tokens": 414085389.0, + "step": 3872 + }, + { + "epoch": 8.823261117445838, + "grad_norm": 3.296875, + "learning_rate": 1.879656093051266e-07, + "loss": 0.5627, + "mean_token_accuracy": 0.8850487917661667, + "num_tokens": 414192622.0, + "step": 3873 + }, + { + "epoch": 8.825541619156214, + "grad_norm": 3.28125, + "learning_rate": 1.872497635547943e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8869512677192688, + "num_tokens": 414299632.0, + "step": 3874 + }, + { + "epoch": 8.82782212086659, + "grad_norm": 2.6875, + "learning_rate": 1.8653523047684642e-07, + "loss": 0.5858, + "mean_token_accuracy": 0.882158100605011, + "num_tokens": 414406824.0, + "step": 3875 + }, + { + "epoch": 8.830102622576966, + "grad_norm": 3.203125, + "learning_rate": 1.858220104768385e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8781119138002396, + "num_tokens": 414513673.0, + "step": 3876 + }, + { + "epoch": 8.832383124287343, + "grad_norm": 2.8125, + "learning_rate": 1.8511010395958067e-07, + "loss": 0.5843, + "mean_token_accuracy": 0.8815079480409622, + "num_tokens": 414621071.0, + "step": 3877 + }, + { + "epoch": 8.83466362599772, + "grad_norm": 2.765625, + "learning_rate": 1.843995113291372e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8865767568349838, + "num_tokens": 414728284.0, + "step": 3878 + }, + { + "epoch": 8.836944127708096, + "grad_norm": 2.59375, + "learning_rate": 1.836902329888268e-07, + "loss": 0.5876, + "mean_token_accuracy": 0.8805812895298004, + "num_tokens": 414835107.0, + "step": 3879 + }, + { + "epoch": 8.839224629418473, + "grad_norm": 3.875, + "learning_rate": 1.829822693412217e-07, + "loss": 0.5468, + "mean_token_accuracy": 0.8886751085519791, + "num_tokens": 414942335.0, + "step": 3880 + }, + { + "epoch": 8.841505131128848, + "grad_norm": 3.984375, + "learning_rate": 1.8227562078814903e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.8845852613449097, + "num_tokens": 415050100.0, + "step": 3881 + }, + { + "epoch": 8.843785632839225, + "grad_norm": 3.6875, + "learning_rate": 1.815702877306888e-07, + "loss": 0.5837, + "mean_token_accuracy": 0.8805902749300003, + "num_tokens": 415157356.0, + "step": 3882 + }, + { + "epoch": 8.8460661345496, + "grad_norm": 5.375, + "learning_rate": 1.8086627056917382e-07, + "loss": 0.5927, + "mean_token_accuracy": 0.878188282251358, + "num_tokens": 415264053.0, + "step": 3883 + }, + { + "epoch": 8.848346636259977, + "grad_norm": 3.328125, + "learning_rate": 1.8016356970319116e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.883014589548111, + "num_tokens": 415371061.0, + "step": 3884 + }, + { + "epoch": 8.850627137970353, + "grad_norm": 4.5625, + "learning_rate": 1.7946218553158062e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8840430676937103, + "num_tokens": 415478709.0, + "step": 3885 + }, + { + "epoch": 8.85290763968073, + "grad_norm": 3.375, + "learning_rate": 1.7876211845243325e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8877336978912354, + "num_tokens": 415586877.0, + "step": 3886 + }, + { + "epoch": 8.855188141391107, + "grad_norm": 3.03125, + "learning_rate": 1.780633688630942e-07, + "loss": 0.5946, + "mean_token_accuracy": 0.8771243989467621, + "num_tokens": 415693262.0, + "step": 3887 + }, + { + "epoch": 8.857468643101482, + "grad_norm": 3.6875, + "learning_rate": 1.773659371601605e-07, + "loss": 0.537, + "mean_token_accuracy": 0.8894240111112595, + "num_tokens": 415800803.0, + "step": 3888 + }, + { + "epoch": 8.85974914481186, + "grad_norm": 4.1875, + "learning_rate": 1.7666982373948038e-07, + "loss": 0.5917, + "mean_token_accuracy": 0.8793479204177856, + "num_tokens": 415907854.0, + "step": 3889 + }, + { + "epoch": 8.862029646522235, + "grad_norm": 4.09375, + "learning_rate": 1.7597502899615538e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8833720535039902, + "num_tokens": 416015169.0, + "step": 3890 + }, + { + "epoch": 8.864310148232612, + "grad_norm": 5.15625, + "learning_rate": 1.752815533245364e-07, + "loss": 0.5711, + "mean_token_accuracy": 0.8827664852142334, + "num_tokens": 416122300.0, + "step": 3891 + }, + { + "epoch": 8.866590649942987, + "grad_norm": 4.4375, + "learning_rate": 1.745893971182272e-07, + "loss": 0.6053, + "mean_token_accuracy": 0.8742794394493103, + "num_tokens": 416228785.0, + "step": 3892 + }, + { + "epoch": 8.868871151653364, + "grad_norm": 2.96875, + "learning_rate": 1.7389856077008245e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8836385905742645, + "num_tokens": 416336321.0, + "step": 3893 + }, + { + "epoch": 8.87115165336374, + "grad_norm": 3.34375, + "learning_rate": 1.7320904467220762e-07, + "loss": 0.5702, + "mean_token_accuracy": 0.8823292106389999, + "num_tokens": 416443558.0, + "step": 3894 + }, + { + "epoch": 8.873432155074116, + "grad_norm": 4.59375, + "learning_rate": 1.725208492159583e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8806789815425873, + "num_tokens": 416550695.0, + "step": 3895 + }, + { + "epoch": 8.875712656784494, + "grad_norm": 3.1875, + "learning_rate": 1.7183397479194175e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8829580396413803, + "num_tokens": 416657754.0, + "step": 3896 + }, + { + "epoch": 8.877993158494869, + "grad_norm": 3.171875, + "learning_rate": 1.711484217900139e-07, + "loss": 0.5661, + "mean_token_accuracy": 0.8825534284114838, + "num_tokens": 416764992.0, + "step": 3897 + }, + { + "epoch": 8.880273660205246, + "grad_norm": 2.796875, + "learning_rate": 1.7046419059928154e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.8812769651412964, + "num_tokens": 416872041.0, + "step": 3898 + }, + { + "epoch": 8.882554161915621, + "grad_norm": 3.546875, + "learning_rate": 1.6978128160810098e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8902477920055389, + "num_tokens": 416979187.0, + "step": 3899 + }, + { + "epoch": 8.884834663625998, + "grad_norm": 3.328125, + "learning_rate": 1.6909969520407854e-07, + "loss": 0.5819, + "mean_token_accuracy": 0.8812951445579529, + "num_tokens": 417086032.0, + "step": 3900 + }, + { + "epoch": 8.887115165336374, + "grad_norm": 2.765625, + "learning_rate": 1.6841943177406976e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8830399364233017, + "num_tokens": 417192794.0, + "step": 3901 + }, + { + "epoch": 8.88939566704675, + "grad_norm": 2.84375, + "learning_rate": 1.6774049170417806e-07, + "loss": 0.5702, + "mean_token_accuracy": 0.8833406120538712, + "num_tokens": 417300018.0, + "step": 3902 + }, + { + "epoch": 8.891676168757126, + "grad_norm": 2.703125, + "learning_rate": 1.6706287537975763e-07, + "loss": 0.5555, + "mean_token_accuracy": 0.8872157335281372, + "num_tokens": 417407045.0, + "step": 3903 + }, + { + "epoch": 8.893956670467503, + "grad_norm": 3.375, + "learning_rate": 1.6638658318540973e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8826020061969757, + "num_tokens": 417514079.0, + "step": 3904 + }, + { + "epoch": 8.896237172177878, + "grad_norm": 3.421875, + "learning_rate": 1.657116155049851e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8848617970943451, + "num_tokens": 417621403.0, + "step": 3905 + }, + { + "epoch": 8.898517673888255, + "grad_norm": 3.03125, + "learning_rate": 1.6503797272158284e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8829448074102402, + "num_tokens": 417728575.0, + "step": 3906 + }, + { + "epoch": 8.900798175598633, + "grad_norm": 3.5625, + "learning_rate": 1.643656552175485e-07, + "loss": 0.5632, + "mean_token_accuracy": 0.8809054046869278, + "num_tokens": 417835651.0, + "step": 3907 + }, + { + "epoch": 8.903078677309008, + "grad_norm": 3.140625, + "learning_rate": 1.6369466337447708e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8784161955118179, + "num_tokens": 417943079.0, + "step": 3908 + }, + { + "epoch": 8.905359179019385, + "grad_norm": 2.609375, + "learning_rate": 1.6302499757321066e-07, + "loss": 0.5859, + "mean_token_accuracy": 0.8787552714347839, + "num_tokens": 418050175.0, + "step": 3909 + }, + { + "epoch": 8.90763968072976, + "grad_norm": 3.484375, + "learning_rate": 1.623566581938385e-07, + "loss": 0.5655, + "mean_token_accuracy": 0.8856528103351593, + "num_tokens": 418157102.0, + "step": 3910 + }, + { + "epoch": 8.909920182440137, + "grad_norm": 3.5, + "learning_rate": 1.6168964561569716e-07, + "loss": 0.5756, + "mean_token_accuracy": 0.880318284034729, + "num_tokens": 418264196.0, + "step": 3911 + }, + { + "epoch": 8.912200684150513, + "grad_norm": 2.9375, + "learning_rate": 1.6102396021737077e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.886517733335495, + "num_tokens": 418371298.0, + "step": 3912 + }, + { + "epoch": 8.91448118586089, + "grad_norm": 3.65625, + "learning_rate": 1.6035960237668818e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8844355195760727, + "num_tokens": 418478409.0, + "step": 3913 + }, + { + "epoch": 8.916761687571265, + "grad_norm": 2.734375, + "learning_rate": 1.5969657247072695e-07, + "loss": 0.5548, + "mean_token_accuracy": 0.8860626816749573, + "num_tokens": 418585134.0, + "step": 3914 + }, + { + "epoch": 8.919042189281642, + "grad_norm": 2.84375, + "learning_rate": 1.5903487087580994e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8894283324480057, + "num_tokens": 418691838.0, + "step": 3915 + }, + { + "epoch": 8.921322690992017, + "grad_norm": 3.0, + "learning_rate": 1.5837449796750588e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8798937052488327, + "num_tokens": 418798999.0, + "step": 3916 + }, + { + "epoch": 8.923603192702394, + "grad_norm": 2.6875, + "learning_rate": 1.577154541206305e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8867987394332886, + "num_tokens": 418906023.0, + "step": 3917 + }, + { + "epoch": 8.925883694412772, + "grad_norm": 3.453125, + "learning_rate": 1.5705773970924349e-07, + "loss": 0.5719, + "mean_token_accuracy": 0.8823074996471405, + "num_tokens": 419013460.0, + "step": 3918 + }, + { + "epoch": 8.928164196123147, + "grad_norm": 3.71875, + "learning_rate": 1.5640135510665094e-07, + "loss": 0.5883, + "mean_token_accuracy": 0.8819030374288559, + "num_tokens": 419120119.0, + "step": 3919 + }, + { + "epoch": 8.930444697833524, + "grad_norm": 3.234375, + "learning_rate": 1.5574630068540458e-07, + "loss": 0.5609, + "mean_token_accuracy": 0.8858517855405807, + "num_tokens": 419227218.0, + "step": 3920 + }, + { + "epoch": 8.9327251995439, + "grad_norm": 2.59375, + "learning_rate": 1.5509257681730034e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8868541866540909, + "num_tokens": 419334292.0, + "step": 3921 + }, + { + "epoch": 8.935005701254276, + "grad_norm": 3.96875, + "learning_rate": 1.5444018387337946e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8833621442317963, + "num_tokens": 419441056.0, + "step": 3922 + }, + { + "epoch": 8.937286202964652, + "grad_norm": 3.453125, + "learning_rate": 1.537891222239271e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8798842430114746, + "num_tokens": 419548438.0, + "step": 3923 + }, + { + "epoch": 8.939566704675029, + "grad_norm": 3.84375, + "learning_rate": 1.5313939223847384e-07, + "loss": 0.5782, + "mean_token_accuracy": 0.8819588273763657, + "num_tokens": 419655399.0, + "step": 3924 + }, + { + "epoch": 8.941847206385404, + "grad_norm": 2.578125, + "learning_rate": 1.5249099428579383e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8892662078142166, + "num_tokens": 419763644.0, + "step": 3925 + }, + { + "epoch": 8.944127708095781, + "grad_norm": 2.875, + "learning_rate": 1.5184392873390463e-07, + "loss": 0.5756, + "mean_token_accuracy": 0.8778876066207886, + "num_tokens": 419870546.0, + "step": 3926 + }, + { + "epoch": 8.946408209806158, + "grad_norm": 2.6875, + "learning_rate": 1.5119819595006857e-07, + "loss": 0.5773, + "mean_token_accuracy": 0.8791385740041733, + "num_tokens": 419977393.0, + "step": 3927 + }, + { + "epoch": 8.948688711516533, + "grad_norm": 3.3125, + "learning_rate": 1.5055379630079163e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8777313679456711, + "num_tokens": 420084260.0, + "step": 3928 + }, + { + "epoch": 8.95096921322691, + "grad_norm": 2.734375, + "learning_rate": 1.4991073015182184e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8823111951351166, + "num_tokens": 420191156.0, + "step": 3929 + }, + { + "epoch": 8.953249714937286, + "grad_norm": 3.953125, + "learning_rate": 1.4926899786815107e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.883946105837822, + "num_tokens": 420298497.0, + "step": 3930 + }, + { + "epoch": 8.955530216647663, + "grad_norm": 2.859375, + "learning_rate": 1.4862859981401468e-07, + "loss": 0.583, + "mean_token_accuracy": 0.882349282503128, + "num_tokens": 420405474.0, + "step": 3931 + }, + { + "epoch": 8.957810718358038, + "grad_norm": 3.765625, + "learning_rate": 1.4798953635288994e-07, + "loss": 0.5589, + "mean_token_accuracy": 0.8836222589015961, + "num_tokens": 420513022.0, + "step": 3932 + }, + { + "epoch": 8.960091220068415, + "grad_norm": 2.90625, + "learning_rate": 1.4735180784749754e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.880601704120636, + "num_tokens": 420620084.0, + "step": 3933 + }, + { + "epoch": 8.96237172177879, + "grad_norm": 2.859375, + "learning_rate": 1.4671541465979877e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8797517418861389, + "num_tokens": 420726931.0, + "step": 3934 + }, + { + "epoch": 8.964652223489168, + "grad_norm": 3.296875, + "learning_rate": 1.460803571509989e-07, + "loss": 0.5746, + "mean_token_accuracy": 0.8833180367946625, + "num_tokens": 420834187.0, + "step": 3935 + }, + { + "epoch": 8.966932725199545, + "grad_norm": 2.65625, + "learning_rate": 1.4544663568154427e-07, + "loss": 0.5821, + "mean_token_accuracy": 0.8844758570194244, + "num_tokens": 420940822.0, + "step": 3936 + }, + { + "epoch": 8.96921322690992, + "grad_norm": 3.125, + "learning_rate": 1.448142506111225e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8841772228479385, + "num_tokens": 421048221.0, + "step": 3937 + }, + { + "epoch": 8.971493728620297, + "grad_norm": 2.609375, + "learning_rate": 1.441832022986636e-07, + "loss": 0.58, + "mean_token_accuracy": 0.8786625862121582, + "num_tokens": 421154952.0, + "step": 3938 + }, + { + "epoch": 8.973774230330672, + "grad_norm": 2.609375, + "learning_rate": 1.4355349110233868e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8868290036916733, + "num_tokens": 421261851.0, + "step": 3939 + }, + { + "epoch": 8.97605473204105, + "grad_norm": 4.1875, + "learning_rate": 1.42925117379559e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8779198080301285, + "num_tokens": 421368450.0, + "step": 3940 + }, + { + "epoch": 8.978335233751425, + "grad_norm": 2.96875, + "learning_rate": 1.4229808148697732e-07, + "loss": 0.5751, + "mean_token_accuracy": 0.8809890896081924, + "num_tokens": 421475775.0, + "step": 3941 + }, + { + "epoch": 8.980615735461802, + "grad_norm": 3.015625, + "learning_rate": 1.416723837804876e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8843148350715637, + "num_tokens": 421582954.0, + "step": 3942 + }, + { + "epoch": 8.982896237172177, + "grad_norm": 3.34375, + "learning_rate": 1.410480246152235e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8814869672060013, + "num_tokens": 421690190.0, + "step": 3943 + }, + { + "epoch": 8.985176738882554, + "grad_norm": 3.546875, + "learning_rate": 1.4042500434555961e-07, + "loss": 0.5812, + "mean_token_accuracy": 0.8802603036165237, + "num_tokens": 421797293.0, + "step": 3944 + }, + { + "epoch": 8.987457240592931, + "grad_norm": 2.890625, + "learning_rate": 1.398033233251095e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.8830568790435791, + "num_tokens": 421904481.0, + "step": 3945 + }, + { + "epoch": 8.989737742303307, + "grad_norm": 2.609375, + "learning_rate": 1.3918298190672806e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8889565318822861, + "num_tokens": 422011408.0, + "step": 3946 + }, + { + "epoch": 8.992018244013684, + "grad_norm": 2.90625, + "learning_rate": 1.3856398044250846e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8796057999134064, + "num_tokens": 422118456.0, + "step": 3947 + }, + { + "epoch": 8.994298745724059, + "grad_norm": 2.65625, + "learning_rate": 1.3794631928378434e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.8840108662843704, + "num_tokens": 422225963.0, + "step": 3948 + }, + { + "epoch": 8.996579247434436, + "grad_norm": 2.515625, + "learning_rate": 1.3732999878112856e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8850581645965576, + "num_tokens": 422333254.0, + "step": 3949 + }, + { + "epoch": 8.998859749144811, + "grad_norm": 2.890625, + "learning_rate": 1.3671501928435193e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8808369934558868, + "num_tokens": 422440753.0, + "step": 3950 + }, + { + "epoch": 9.0, + "grad_norm": 9.0, + "learning_rate": 1.361013811425052e-07, + "loss": 0.5721, + "mean_token_accuracy": 0.8868737518787384, + "num_tokens": 422480088.0, + "step": 3951 + }, + { + "epoch": 9.002280501710377, + "grad_norm": 3.015625, + "learning_rate": 1.3548908470387783e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8844682276248932, + "num_tokens": 422587127.0, + "step": 3952 + }, + { + "epoch": 9.004561003420752, + "grad_norm": 2.421875, + "learning_rate": 1.348781303159974e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8936333805322647, + "num_tokens": 422694591.0, + "step": 3953 + }, + { + "epoch": 9.00684150513113, + "grad_norm": 3.234375, + "learning_rate": 1.3426851832562982e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.8864496648311615, + "num_tokens": 422802195.0, + "step": 3954 + }, + { + "epoch": 9.009122006841505, + "grad_norm": 2.5, + "learning_rate": 1.3366024907877917e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8846766799688339, + "num_tokens": 422909976.0, + "step": 3955 + }, + { + "epoch": 9.011402508551882, + "grad_norm": 2.921875, + "learning_rate": 1.3305332292068706e-07, + "loss": 0.5821, + "mean_token_accuracy": 0.8805938214063644, + "num_tokens": 423016740.0, + "step": 3956 + }, + { + "epoch": 9.013683010262257, + "grad_norm": 3.546875, + "learning_rate": 1.3244774019583296e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8852843195199966, + "num_tokens": 423123757.0, + "step": 3957 + }, + { + "epoch": 9.015963511972634, + "grad_norm": 3.0625, + "learning_rate": 1.318435012479341e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8831017911434174, + "num_tokens": 423229906.0, + "step": 3958 + }, + { + "epoch": 9.01824401368301, + "grad_norm": 2.8125, + "learning_rate": 1.3124060641994507e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.88319331407547, + "num_tokens": 423337912.0, + "step": 3959 + }, + { + "epoch": 9.020524515393387, + "grad_norm": 3.421875, + "learning_rate": 1.306390560540577e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8833757936954498, + "num_tokens": 423445320.0, + "step": 3960 + }, + { + "epoch": 9.020524515393387, + "eval_loss": 0.5863176584243774, + "eval_mean_token_accuracy": 0.8799216114976107, + "eval_num_tokens": 423445320.0, + "eval_runtime": 58.6917, + "eval_samples_per_second": 142.865, + "eval_steps_per_second": 4.481, + "step": 3960 + }, + { + "epoch": 9.022805017103764, + "grad_norm": 3.03125, + "learning_rate": 1.300388504916991e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8808793723583221, + "num_tokens": 423551972.0, + "step": 3961 + }, + { + "epoch": 9.025085518814139, + "grad_norm": 3.375, + "learning_rate": 1.2943999007353518e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.8862560987472534, + "num_tokens": 423658903.0, + "step": 3962 + }, + { + "epoch": 9.027366020524516, + "grad_norm": 3.6875, + "learning_rate": 1.2884247513946761e-07, + "loss": 0.5818, + "mean_token_accuracy": 0.8841415643692017, + "num_tokens": 423765920.0, + "step": 3963 + }, + { + "epoch": 9.029646522234891, + "grad_norm": 2.796875, + "learning_rate": 1.2824630602863402e-07, + "loss": 0.5812, + "mean_token_accuracy": 0.8800481855869293, + "num_tokens": 423872709.0, + "step": 3964 + }, + { + "epoch": 9.031927023945268, + "grad_norm": 2.75, + "learning_rate": 1.2765148307940927e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8826144337654114, + "num_tokens": 423979876.0, + "step": 3965 + }, + { + "epoch": 9.034207525655644, + "grad_norm": 4.09375, + "learning_rate": 1.270580066294022e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8837986141443253, + "num_tokens": 424086963.0, + "step": 3966 + }, + { + "epoch": 9.03648802736602, + "grad_norm": 3.5625, + "learning_rate": 1.264658770154592e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8831162303686142, + "num_tokens": 424193595.0, + "step": 3967 + }, + { + "epoch": 9.038768529076396, + "grad_norm": 3.78125, + "learning_rate": 1.258750945736617e-07, + "loss": 0.5552, + "mean_token_accuracy": 0.8876445442438126, + "num_tokens": 424300328.0, + "step": 3968 + }, + { + "epoch": 9.041049030786773, + "grad_norm": 3.46875, + "learning_rate": 1.252856596393262e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8856015801429749, + "num_tokens": 424407500.0, + "step": 3969 + }, + { + "epoch": 9.043329532497149, + "grad_norm": 2.75, + "learning_rate": 1.2469757254700454e-07, + "loss": 0.5624, + "mean_token_accuracy": 0.8857114911079407, + "num_tokens": 424515230.0, + "step": 3970 + }, + { + "epoch": 9.045610034207526, + "grad_norm": 2.828125, + "learning_rate": 1.2411083363048386e-07, + "loss": 0.5901, + "mean_token_accuracy": 0.878177598118782, + "num_tokens": 424622085.0, + "step": 3971 + }, + { + "epoch": 9.047890535917903, + "grad_norm": 2.921875, + "learning_rate": 1.2352544322278558e-07, + "loss": 0.5995, + "mean_token_accuracy": 0.8785828799009323, + "num_tokens": 424728586.0, + "step": 3972 + }, + { + "epoch": 9.050171037628278, + "grad_norm": 3.109375, + "learning_rate": 1.2294140165616613e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.883171334862709, + "num_tokens": 424835689.0, + "step": 3973 + }, + { + "epoch": 9.052451539338655, + "grad_norm": 2.6875, + "learning_rate": 1.223587092621162e-07, + "loss": 0.5783, + "mean_token_accuracy": 0.880466416478157, + "num_tokens": 424942400.0, + "step": 3974 + }, + { + "epoch": 9.05473204104903, + "grad_norm": 3.921875, + "learning_rate": 1.2177736637136063e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8840536028146744, + "num_tokens": 425049454.0, + "step": 3975 + }, + { + "epoch": 9.057012542759407, + "grad_norm": 4.125, + "learning_rate": 1.2119737331385885e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.8803068101406097, + "num_tokens": 425156346.0, + "step": 3976 + }, + { + "epoch": 9.059293044469783, + "grad_norm": 3.734375, + "learning_rate": 1.2061873041880335e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.8801010698080063, + "num_tokens": 425262768.0, + "step": 3977 + }, + { + "epoch": 9.06157354618016, + "grad_norm": 2.546875, + "learning_rate": 1.200414380146206e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.882665142416954, + "num_tokens": 425370275.0, + "step": 3978 + }, + { + "epoch": 9.063854047890535, + "grad_norm": 3.203125, + "learning_rate": 1.1946549642897043e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8818689733743668, + "num_tokens": 425477099.0, + "step": 3979 + }, + { + "epoch": 9.066134549600912, + "grad_norm": 3.125, + "learning_rate": 1.1889090598874692e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8815028220415115, + "num_tokens": 425584752.0, + "step": 3980 + }, + { + "epoch": 9.06841505131129, + "grad_norm": 4.25, + "learning_rate": 1.1831766702007613e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.8803604692220688, + "num_tokens": 425691333.0, + "step": 3981 + }, + { + "epoch": 9.070695553021665, + "grad_norm": 3.0, + "learning_rate": 1.1774577984831725e-07, + "loss": 0.5801, + "mean_token_accuracy": 0.8821363896131516, + "num_tokens": 425798165.0, + "step": 3982 + }, + { + "epoch": 9.072976054732042, + "grad_norm": 3.484375, + "learning_rate": 1.1717524479806231e-07, + "loss": 0.578, + "mean_token_accuracy": 0.8794075697660446, + "num_tokens": 425904837.0, + "step": 3983 + }, + { + "epoch": 9.075256556442417, + "grad_norm": 3.046875, + "learning_rate": 1.1660606219313642e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8835705667734146, + "num_tokens": 426012137.0, + "step": 3984 + }, + { + "epoch": 9.077537058152794, + "grad_norm": 3.5625, + "learning_rate": 1.1603823235659644e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8850951492786407, + "num_tokens": 426119123.0, + "step": 3985 + }, + { + "epoch": 9.07981755986317, + "grad_norm": 2.78125, + "learning_rate": 1.1547175561073154e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8812219202518463, + "num_tokens": 426226278.0, + "step": 3986 + }, + { + "epoch": 9.082098061573546, + "grad_norm": 2.953125, + "learning_rate": 1.1490663227706311e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.88066066801548, + "num_tokens": 426333015.0, + "step": 3987 + }, + { + "epoch": 9.084378563283922, + "grad_norm": 2.484375, + "learning_rate": 1.1434286267634432e-07, + "loss": 0.5532, + "mean_token_accuracy": 0.8882465213537216, + "num_tokens": 426440373.0, + "step": 3988 + }, + { + "epoch": 9.086659064994299, + "grad_norm": 3.28125, + "learning_rate": 1.1378044712855946e-07, + "loss": 0.6018, + "mean_token_accuracy": 0.8752111494541168, + "num_tokens": 426546913.0, + "step": 3989 + }, + { + "epoch": 9.088939566704674, + "grad_norm": 2.96875, + "learning_rate": 1.1321938595292542e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8806653469800949, + "num_tokens": 426654053.0, + "step": 3990 + }, + { + "epoch": 9.091220068415051, + "grad_norm": 3.015625, + "learning_rate": 1.1265967946788913e-07, + "loss": 0.5653, + "mean_token_accuracy": 0.8836180865764618, + "num_tokens": 426761077.0, + "step": 3991 + }, + { + "epoch": 9.093500570125428, + "grad_norm": 3.671875, + "learning_rate": 1.1210132799112954e-07, + "loss": 0.5426, + "mean_token_accuracy": 0.8866991996765137, + "num_tokens": 426868400.0, + "step": 3992 + }, + { + "epoch": 9.095781071835804, + "grad_norm": 2.734375, + "learning_rate": 1.1154433183955593e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.880575105547905, + "num_tokens": 426975681.0, + "step": 3993 + }, + { + "epoch": 9.09806157354618, + "grad_norm": 3.03125, + "learning_rate": 1.1098869132930846e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.880921483039856, + "num_tokens": 427082443.0, + "step": 3994 + }, + { + "epoch": 9.100342075256556, + "grad_norm": 2.71875, + "learning_rate": 1.1043440677575818e-07, + "loss": 0.5642, + "mean_token_accuracy": 0.88336580991745, + "num_tokens": 427189040.0, + "step": 3995 + }, + { + "epoch": 9.102622576966933, + "grad_norm": 3.015625, + "learning_rate": 1.0988147849350623e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8802250772714615, + "num_tokens": 427295944.0, + "step": 3996 + }, + { + "epoch": 9.104903078677308, + "grad_norm": 2.46875, + "learning_rate": 1.0932990679638406e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8847872316837311, + "num_tokens": 427402191.0, + "step": 3997 + }, + { + "epoch": 9.107183580387685, + "grad_norm": 3.5, + "learning_rate": 1.0877969199745347e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8834807723760605, + "num_tokens": 427509134.0, + "step": 3998 + }, + { + "epoch": 9.10946408209806, + "grad_norm": 3.078125, + "learning_rate": 1.0823083440900523e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8807051777839661, + "num_tokens": 427616092.0, + "step": 3999 + }, + { + "epoch": 9.111744583808438, + "grad_norm": 3.578125, + "learning_rate": 1.0768333434256039e-07, + "loss": 0.587, + "mean_token_accuracy": 0.8833329081535339, + "num_tokens": 427723041.0, + "step": 4000 + }, + { + "epoch": 9.114025085518815, + "grad_norm": 2.765625, + "learning_rate": 1.071371921088693e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.8830773681402206, + "num_tokens": 427830278.0, + "step": 4001 + }, + { + "epoch": 9.11630558722919, + "grad_norm": 2.828125, + "learning_rate": 1.0659240801791204e-07, + "loss": 0.575, + "mean_token_accuracy": 0.8835585117340088, + "num_tokens": 427937452.0, + "step": 4002 + }, + { + "epoch": 9.118586088939567, + "grad_norm": 3.125, + "learning_rate": 1.0604898237889794e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8840138912200928, + "num_tokens": 428044476.0, + "step": 4003 + }, + { + "epoch": 9.120866590649943, + "grad_norm": 3.984375, + "learning_rate": 1.0550691550026415e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8799526989459991, + "num_tokens": 428151794.0, + "step": 4004 + }, + { + "epoch": 9.12314709236032, + "grad_norm": 3.6875, + "learning_rate": 1.0496620768967736e-07, + "loss": 0.5874, + "mean_token_accuracy": 0.8784212470054626, + "num_tokens": 428258124.0, + "step": 4005 + }, + { + "epoch": 9.125427594070695, + "grad_norm": 3.859375, + "learning_rate": 1.0442685925403346e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8846355378627777, + "num_tokens": 428365889.0, + "step": 4006 + }, + { + "epoch": 9.127708095781072, + "grad_norm": 4.125, + "learning_rate": 1.0388887049945589e-07, + "loss": 0.5837, + "mean_token_accuracy": 0.8810495138168335, + "num_tokens": 428472825.0, + "step": 4007 + }, + { + "epoch": 9.129988597491447, + "grad_norm": 2.8125, + "learning_rate": 1.0335224173129683e-07, + "loss": 0.5729, + "mean_token_accuracy": 0.8812502026557922, + "num_tokens": 428580741.0, + "step": 4008 + }, + { + "epoch": 9.132269099201825, + "grad_norm": 3.046875, + "learning_rate": 1.0281697325413593e-07, + "loss": 0.5508, + "mean_token_accuracy": 0.8845276087522507, + "num_tokens": 428687964.0, + "step": 4009 + }, + { + "epoch": 9.134549600912202, + "grad_norm": 3.078125, + "learning_rate": 1.0228306537178185e-07, + "loss": 0.5828, + "mean_token_accuracy": 0.8836958706378937, + "num_tokens": 428794803.0, + "step": 4010 + }, + { + "epoch": 9.136830102622577, + "grad_norm": 2.859375, + "learning_rate": 1.0175051838727023e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8883339315652847, + "num_tokens": 428901676.0, + "step": 4011 + }, + { + "epoch": 9.139110604332954, + "grad_norm": 3.046875, + "learning_rate": 1.0121933260286432e-07, + "loss": 0.5539, + "mean_token_accuracy": 0.8851373344659805, + "num_tokens": 429009258.0, + "step": 4012 + }, + { + "epoch": 9.14139110604333, + "grad_norm": 3.296875, + "learning_rate": 1.0068950832005487e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8837276846170425, + "num_tokens": 429116708.0, + "step": 4013 + }, + { + "epoch": 9.143671607753706, + "grad_norm": 3.0625, + "learning_rate": 1.0016104583956021e-07, + "loss": 0.5865, + "mean_token_accuracy": 0.8805951774120331, + "num_tokens": 429223799.0, + "step": 4014 + }, + { + "epoch": 9.145952109464082, + "grad_norm": 3.6875, + "learning_rate": 9.963394546132488e-08, + "loss": 0.5714, + "mean_token_accuracy": 0.8829237967729568, + "num_tokens": 429330842.0, + "step": 4015 + }, + { + "epoch": 9.148232611174459, + "grad_norm": 2.625, + "learning_rate": 9.91082074845215e-08, + "loss": 0.5576, + "mean_token_accuracy": 0.8858160525560379, + "num_tokens": 429438548.0, + "step": 4016 + }, + { + "epoch": 9.150513112884834, + "grad_norm": 3.0625, + "learning_rate": 9.85838322075483e-08, + "loss": 0.5708, + "mean_token_accuracy": 0.8821554481983185, + "num_tokens": 429545699.0, + "step": 4017 + }, + { + "epoch": 9.152793614595211, + "grad_norm": 3.78125, + "learning_rate": 9.806081992803084e-08, + "loss": 0.5884, + "mean_token_accuracy": 0.8780340850353241, + "num_tokens": 429653689.0, + "step": 4018 + }, + { + "epoch": 9.155074116305586, + "grad_norm": 3.03125, + "learning_rate": 9.753917094282112e-08, + "loss": 0.5746, + "mean_token_accuracy": 0.8829029351472855, + "num_tokens": 429759979.0, + "step": 4019 + }, + { + "epoch": 9.157354618015964, + "grad_norm": 2.46875, + "learning_rate": 9.701888554799643e-08, + "loss": 0.5683, + "mean_token_accuracy": 0.883645236492157, + "num_tokens": 429867491.0, + "step": 4020 + }, + { + "epoch": 9.15963511972634, + "grad_norm": 2.953125, + "learning_rate": 9.649996403886086e-08, + "loss": 0.5653, + "mean_token_accuracy": 0.8832023441791534, + "num_tokens": 429974470.0, + "step": 4021 + }, + { + "epoch": 9.161915621436716, + "grad_norm": 3.0625, + "learning_rate": 9.598240670994435e-08, + "loss": 0.5901, + "mean_token_accuracy": 0.8796355575323105, + "num_tokens": 430081389.0, + "step": 4022 + }, + { + "epoch": 9.164196123147093, + "grad_norm": 3.359375, + "learning_rate": 9.546621385500249e-08, + "loss": 0.5847, + "mean_token_accuracy": 0.8781373351812363, + "num_tokens": 430188309.0, + "step": 4023 + }, + { + "epoch": 9.166476624857468, + "grad_norm": 2.921875, + "learning_rate": 9.495138576701673e-08, + "loss": 0.5693, + "mean_token_accuracy": 0.883067861199379, + "num_tokens": 430295602.0, + "step": 4024 + }, + { + "epoch": 9.168757126567845, + "grad_norm": 3.078125, + "learning_rate": 9.443792273819252e-08, + "loss": 0.5593, + "mean_token_accuracy": 0.8858152031898499, + "num_tokens": 430402252.0, + "step": 4025 + }, + { + "epoch": 9.17103762827822, + "grad_norm": 2.734375, + "learning_rate": 9.392582505996256e-08, + "loss": 0.5787, + "mean_token_accuracy": 0.8808012902736664, + "num_tokens": 430508615.0, + "step": 4026 + }, + { + "epoch": 9.173318129988598, + "grad_norm": 3.4375, + "learning_rate": 9.341509302298295e-08, + "loss": 0.5636, + "mean_token_accuracy": 0.8854473978281021, + "num_tokens": 430615447.0, + "step": 4027 + }, + { + "epoch": 9.175598631698973, + "grad_norm": 2.703125, + "learning_rate": 9.290572691713573e-08, + "loss": 0.5635, + "mean_token_accuracy": 0.885080024600029, + "num_tokens": 430722374.0, + "step": 4028 + }, + { + "epoch": 9.17787913340935, + "grad_norm": 2.609375, + "learning_rate": 9.23977270315271e-08, + "loss": 0.5819, + "mean_token_accuracy": 0.8824837505817413, + "num_tokens": 430829818.0, + "step": 4029 + }, + { + "epoch": 9.180159635119727, + "grad_norm": 2.640625, + "learning_rate": 9.18910936544884e-08, + "loss": 0.548, + "mean_token_accuracy": 0.8877168446779251, + "num_tokens": 430936968.0, + "step": 4030 + }, + { + "epoch": 9.182440136830103, + "grad_norm": 3.21875, + "learning_rate": 9.138582707357429e-08, + "loss": 0.5572, + "mean_token_accuracy": 0.8826006203889847, + "num_tokens": 431043889.0, + "step": 4031 + }, + { + "epoch": 9.18472063854048, + "grad_norm": 3.96875, + "learning_rate": 9.088192757556457e-08, + "loss": 0.576, + "mean_token_accuracy": 0.8780194073915482, + "num_tokens": 431150431.0, + "step": 4032 + }, + { + "epoch": 9.187001140250855, + "grad_norm": 2.984375, + "learning_rate": 9.037939544646324e-08, + "loss": 0.551, + "mean_token_accuracy": 0.8834515959024429, + "num_tokens": 431257816.0, + "step": 4033 + }, + { + "epoch": 9.189281641961232, + "grad_norm": 3.046875, + "learning_rate": 8.987823097149739e-08, + "loss": 0.5718, + "mean_token_accuracy": 0.8835292160511017, + "num_tokens": 431364325.0, + "step": 4034 + }, + { + "epoch": 9.191562143671607, + "grad_norm": 3.0, + "learning_rate": 8.93784344351184e-08, + "loss": 0.5672, + "mean_token_accuracy": 0.8845600485801697, + "num_tokens": 431471086.0, + "step": 4035 + }, + { + "epoch": 9.193842645381984, + "grad_norm": 3.0, + "learning_rate": 8.888000612100128e-08, + "loss": 0.5694, + "mean_token_accuracy": 0.8847863525152206, + "num_tokens": 431578776.0, + "step": 4036 + }, + { + "epoch": 9.19612314709236, + "grad_norm": 2.875, + "learning_rate": 8.838294631204391e-08, + "loss": 0.5679, + "mean_token_accuracy": 0.8858127593994141, + "num_tokens": 431685845.0, + "step": 4037 + }, + { + "epoch": 9.198403648802737, + "grad_norm": 3.21875, + "learning_rate": 8.788725529036812e-08, + "loss": 0.5587, + "mean_token_accuracy": 0.885520726442337, + "num_tokens": 431793922.0, + "step": 4038 + }, + { + "epoch": 9.200684150513112, + "grad_norm": 3.109375, + "learning_rate": 8.739293333731886e-08, + "loss": 0.5813, + "mean_token_accuracy": 0.8812271952629089, + "num_tokens": 431900233.0, + "step": 4039 + }, + { + "epoch": 9.20296465222349, + "grad_norm": 2.75, + "learning_rate": 8.689998073346361e-08, + "loss": 0.5602, + "mean_token_accuracy": 0.8870201855897903, + "num_tokens": 432007117.0, + "step": 4040 + }, + { + "epoch": 9.205245153933866, + "grad_norm": 3.25, + "learning_rate": 8.640839775859222e-08, + "loss": 0.5591, + "mean_token_accuracy": 0.8841793239116669, + "num_tokens": 432114160.0, + "step": 4041 + }, + { + "epoch": 9.207525655644242, + "grad_norm": 2.75, + "learning_rate": 8.591818469171815e-08, + "loss": 0.5726, + "mean_token_accuracy": 0.8814292848110199, + "num_tokens": 432221294.0, + "step": 4042 + }, + { + "epoch": 9.209806157354619, + "grad_norm": 3.078125, + "learning_rate": 8.542934181107687e-08, + "loss": 0.5693, + "mean_token_accuracy": 0.8811886161565781, + "num_tokens": 432328223.0, + "step": 4043 + }, + { + "epoch": 9.212086659064994, + "grad_norm": 3.390625, + "learning_rate": 8.494186939412591e-08, + "loss": 0.5692, + "mean_token_accuracy": 0.8848684132099152, + "num_tokens": 432435810.0, + "step": 4044 + }, + { + "epoch": 9.214367160775371, + "grad_norm": 2.984375, + "learning_rate": 8.44557677175456e-08, + "loss": 0.5732, + "mean_token_accuracy": 0.8800568580627441, + "num_tokens": 432542846.0, + "step": 4045 + }, + { + "epoch": 9.216647662485746, + "grad_norm": 2.5625, + "learning_rate": 8.397103705723774e-08, + "loss": 0.5613, + "mean_token_accuracy": 0.8849961012601852, + "num_tokens": 432649987.0, + "step": 4046 + }, + { + "epoch": 9.218928164196123, + "grad_norm": 2.8125, + "learning_rate": 8.348767768832561e-08, + "loss": 0.566, + "mean_token_accuracy": 0.8844825327396393, + "num_tokens": 432756682.0, + "step": 4047 + }, + { + "epoch": 9.221208665906499, + "grad_norm": 3.015625, + "learning_rate": 8.300568988515529e-08, + "loss": 0.5756, + "mean_token_accuracy": 0.8798407763242722, + "num_tokens": 432863598.0, + "step": 4048 + }, + { + "epoch": 9.223489167616876, + "grad_norm": 3.703125, + "learning_rate": 8.25250739212935e-08, + "loss": 0.5694, + "mean_token_accuracy": 0.8815117180347443, + "num_tokens": 432970619.0, + "step": 4049 + }, + { + "epoch": 9.225769669327253, + "grad_norm": 2.96875, + "learning_rate": 8.204583006952843e-08, + "loss": 0.5811, + "mean_token_accuracy": 0.88057541847229, + "num_tokens": 433077375.0, + "step": 4050 + }, + { + "epoch": 9.228050171037628, + "grad_norm": 2.890625, + "learning_rate": 8.156795860187028e-08, + "loss": 0.5483, + "mean_token_accuracy": 0.8875467032194138, + "num_tokens": 433184582.0, + "step": 4051 + }, + { + "epoch": 9.230330672748005, + "grad_norm": 2.65625, + "learning_rate": 8.109145978954874e-08, + "loss": 0.582, + "mean_token_accuracy": 0.8819152861833572, + "num_tokens": 433291314.0, + "step": 4052 + }, + { + "epoch": 9.23261117445838, + "grad_norm": 2.75, + "learning_rate": 8.061633390301582e-08, + "loss": 0.5711, + "mean_token_accuracy": 0.8808294236660004, + "num_tokens": 433398910.0, + "step": 4053 + }, + { + "epoch": 9.234891676168758, + "grad_norm": 3.03125, + "learning_rate": 8.014258121194385e-08, + "loss": 0.5633, + "mean_token_accuracy": 0.885577842593193, + "num_tokens": 433505751.0, + "step": 4054 + }, + { + "epoch": 9.237172177879133, + "grad_norm": 3.984375, + "learning_rate": 7.967020198522579e-08, + "loss": 0.5536, + "mean_token_accuracy": 0.885611966252327, + "num_tokens": 433613189.0, + "step": 4055 + }, + { + "epoch": 9.23945267958951, + "grad_norm": 2.3125, + "learning_rate": 7.91991964909744e-08, + "loss": 0.5444, + "mean_token_accuracy": 0.8886383771896362, + "num_tokens": 433720725.0, + "step": 4056 + }, + { + "epoch": 9.241733181299885, + "grad_norm": 3.0625, + "learning_rate": 7.872956499652418e-08, + "loss": 0.5946, + "mean_token_accuracy": 0.8744540363550186, + "num_tokens": 433827682.0, + "step": 4057 + }, + { + "epoch": 9.244013683010262, + "grad_norm": 3.0625, + "learning_rate": 7.826130776842828e-08, + "loss": 0.5958, + "mean_token_accuracy": 0.8749952912330627, + "num_tokens": 433934460.0, + "step": 4058 + }, + { + "epoch": 9.246294184720638, + "grad_norm": 3.375, + "learning_rate": 7.779442507246021e-08, + "loss": 0.5542, + "mean_token_accuracy": 0.8850563615560532, + "num_tokens": 434041663.0, + "step": 4059 + }, + { + "epoch": 9.248574686431015, + "grad_norm": 3.5625, + "learning_rate": 7.73289171736144e-08, + "loss": 0.5755, + "mean_token_accuracy": 0.882745549082756, + "num_tokens": 434147970.0, + "step": 4060 + }, + { + "epoch": 9.250855188141392, + "grad_norm": 3.21875, + "learning_rate": 7.686478433610339e-08, + "loss": 0.5937, + "mean_token_accuracy": 0.8776877820491791, + "num_tokens": 434255682.0, + "step": 4061 + }, + { + "epoch": 9.253135689851767, + "grad_norm": 2.96875, + "learning_rate": 7.64020268233609e-08, + "loss": 0.5453, + "mean_token_accuracy": 0.8867078274488449, + "num_tokens": 434362871.0, + "step": 4062 + }, + { + "epoch": 9.255416191562144, + "grad_norm": 3.140625, + "learning_rate": 7.594064489803821e-08, + "loss": 0.5594, + "mean_token_accuracy": 0.8832486718893051, + "num_tokens": 434470335.0, + "step": 4063 + }, + { + "epoch": 9.25769669327252, + "grad_norm": 3.015625, + "learning_rate": 7.548063882200724e-08, + "loss": 0.5691, + "mean_token_accuracy": 0.8802042752504349, + "num_tokens": 434577136.0, + "step": 4064 + }, + { + "epoch": 9.259977194982897, + "grad_norm": 3.609375, + "learning_rate": 7.502200885635858e-08, + "loss": 0.5797, + "mean_token_accuracy": 0.8827760517597198, + "num_tokens": 434684066.0, + "step": 4065 + }, + { + "epoch": 9.262257696693272, + "grad_norm": 5.09375, + "learning_rate": 7.45647552614015e-08, + "loss": 0.5649, + "mean_token_accuracy": 0.8822989910840988, + "num_tokens": 434791074.0, + "step": 4066 + }, + { + "epoch": 9.264538198403649, + "grad_norm": 3.1875, + "learning_rate": 7.410887829666479e-08, + "loss": 0.5713, + "mean_token_accuracy": 0.8809485137462616, + "num_tokens": 434897922.0, + "step": 4067 + }, + { + "epoch": 9.266818700114024, + "grad_norm": 2.65625, + "learning_rate": 7.365437822089482e-08, + "loss": 0.5737, + "mean_token_accuracy": 0.8806920945644379, + "num_tokens": 435005339.0, + "step": 4068 + }, + { + "epoch": 9.269099201824401, + "grad_norm": 3.03125, + "learning_rate": 7.320125529205746e-08, + "loss": 0.5583, + "mean_token_accuracy": 0.8838251531124115, + "num_tokens": 435112853.0, + "step": 4069 + }, + { + "epoch": 9.271379703534778, + "grad_norm": 3.265625, + "learning_rate": 7.274950976733642e-08, + "loss": 0.5781, + "mean_token_accuracy": 0.8822825849056244, + "num_tokens": 435220066.0, + "step": 4070 + }, + { + "epoch": 9.273660205245154, + "grad_norm": 2.875, + "learning_rate": 7.22991419031338e-08, + "loss": 0.5787, + "mean_token_accuracy": 0.8838555067777634, + "num_tokens": 435327142.0, + "step": 4071 + }, + { + "epoch": 9.27594070695553, + "grad_norm": 2.796875, + "learning_rate": 7.185015195506961e-08, + "loss": 0.5666, + "mean_token_accuracy": 0.8847849667072296, + "num_tokens": 435433996.0, + "step": 4072 + }, + { + "epoch": 9.278221208665906, + "grad_norm": 3.46875, + "learning_rate": 7.140254017798221e-08, + "loss": 0.5752, + "mean_token_accuracy": 0.8813326507806778, + "num_tokens": 435541275.0, + "step": 4073 + }, + { + "epoch": 9.280501710376283, + "grad_norm": 3.40625, + "learning_rate": 7.095630682592669e-08, + "loss": 0.5792, + "mean_token_accuracy": 0.8786799758672714, + "num_tokens": 435648369.0, + "step": 4074 + }, + { + "epoch": 9.282782212086659, + "grad_norm": 2.578125, + "learning_rate": 7.051145215217715e-08, + "loss": 0.5671, + "mean_token_accuracy": 0.8824814110994339, + "num_tokens": 435755124.0, + "step": 4075 + }, + { + "epoch": 9.285062713797036, + "grad_norm": 3.484375, + "learning_rate": 7.006797640922436e-08, + "loss": 0.5746, + "mean_token_accuracy": 0.8815957754850388, + "num_tokens": 435861525.0, + "step": 4076 + }, + { + "epoch": 9.287343215507411, + "grad_norm": 3.015625, + "learning_rate": 6.962587984877617e-08, + "loss": 0.5816, + "mean_token_accuracy": 0.8820008486509323, + "num_tokens": 435968074.0, + "step": 4077 + }, + { + "epoch": 9.289623717217788, + "grad_norm": 3.4375, + "learning_rate": 6.918516272175879e-08, + "loss": 0.5769, + "mean_token_accuracy": 0.8813347816467285, + "num_tokens": 436075305.0, + "step": 4078 + }, + { + "epoch": 9.291904218928163, + "grad_norm": 2.78125, + "learning_rate": 6.874582527831409e-08, + "loss": 0.5631, + "mean_token_accuracy": 0.8813544809818268, + "num_tokens": 436183200.0, + "step": 4079 + }, + { + "epoch": 9.29418472063854, + "grad_norm": 3.203125, + "learning_rate": 6.830786776780174e-08, + "loss": 0.607, + "mean_token_accuracy": 0.8745275288820267, + "num_tokens": 436289903.0, + "step": 4080 + }, + { + "epoch": 9.296465222348917, + "grad_norm": 2.953125, + "learning_rate": 6.78712904387982e-08, + "loss": 0.5851, + "mean_token_accuracy": 0.8781695067882538, + "num_tokens": 436396884.0, + "step": 4081 + }, + { + "epoch": 9.298745724059293, + "grad_norm": 3.15625, + "learning_rate": 6.74360935390958e-08, + "loss": 0.5561, + "mean_token_accuracy": 0.8871065676212311, + "num_tokens": 436504033.0, + "step": 4082 + }, + { + "epoch": 9.30102622576967, + "grad_norm": 7.09375, + "learning_rate": 6.700227731570475e-08, + "loss": 0.5711, + "mean_token_accuracy": 0.8803330808877945, + "num_tokens": 436610843.0, + "step": 4083 + }, + { + "epoch": 9.303306727480045, + "grad_norm": 2.734375, + "learning_rate": 6.656984201485001e-08, + "loss": 0.5725, + "mean_token_accuracy": 0.8825246691703796, + "num_tokens": 436717813.0, + "step": 4084 + }, + { + "epoch": 9.305587229190422, + "grad_norm": 3.40625, + "learning_rate": 6.613878788197359e-08, + "loss": 0.5887, + "mean_token_accuracy": 0.8762739151716232, + "num_tokens": 436824381.0, + "step": 4085 + }, + { + "epoch": 9.307867730900798, + "grad_norm": 2.46875, + "learning_rate": 6.570911516173368e-08, + "loss": 0.5568, + "mean_token_accuracy": 0.8874360471963882, + "num_tokens": 436931568.0, + "step": 4086 + }, + { + "epoch": 9.310148232611175, + "grad_norm": 3.828125, + "learning_rate": 6.528082409800434e-08, + "loss": 0.5697, + "mean_token_accuracy": 0.8826966434717178, + "num_tokens": 437038163.0, + "step": 4087 + }, + { + "epoch": 9.31242873432155, + "grad_norm": 2.578125, + "learning_rate": 6.485391493387505e-08, + "loss": 0.5684, + "mean_token_accuracy": 0.883079007267952, + "num_tokens": 437145250.0, + "step": 4088 + }, + { + "epoch": 9.314709236031927, + "grad_norm": 3.25, + "learning_rate": 6.442838791165168e-08, + "loss": 0.5927, + "mean_token_accuracy": 0.8812314420938492, + "num_tokens": 437251824.0, + "step": 4089 + }, + { + "epoch": 9.316989737742304, + "grad_norm": 3.578125, + "learning_rate": 6.400424327285437e-08, + "loss": 0.581, + "mean_token_accuracy": 0.8754829615354538, + "num_tokens": 437358958.0, + "step": 4090 + }, + { + "epoch": 9.31927023945268, + "grad_norm": 2.703125, + "learning_rate": 6.358148125822e-08, + "loss": 0.5529, + "mean_token_accuracy": 0.8846077919006348, + "num_tokens": 437466352.0, + "step": 4091 + }, + { + "epoch": 9.321550741163056, + "grad_norm": 2.96875, + "learning_rate": 6.316010210769997e-08, + "loss": 0.5987, + "mean_token_accuracy": 0.8769486397504807, + "num_tokens": 437573435.0, + "step": 4092 + }, + { + "epoch": 9.323831242873432, + "grad_norm": 2.78125, + "learning_rate": 6.274010606046071e-08, + "loss": 0.5611, + "mean_token_accuracy": 0.8839032351970673, + "num_tokens": 437680600.0, + "step": 4093 + }, + { + "epoch": 9.326111744583809, + "grad_norm": 2.796875, + "learning_rate": 6.232149335488463e-08, + "loss": 0.572, + "mean_token_accuracy": 0.8846787661314011, + "num_tokens": 437787269.0, + "step": 4094 + }, + { + "epoch": 9.328392246294184, + "grad_norm": 3.84375, + "learning_rate": 6.190426422856749e-08, + "loss": 0.5708, + "mean_token_accuracy": 0.8827256411314011, + "num_tokens": 437893943.0, + "step": 4095 + }, + { + "epoch": 9.330672748004561, + "grad_norm": 2.65625, + "learning_rate": 6.148841891832069e-08, + "loss": 0.5731, + "mean_token_accuracy": 0.8799492716789246, + "num_tokens": 438000911.0, + "step": 4096 + }, + { + "epoch": 9.332953249714937, + "grad_norm": 2.796875, + "learning_rate": 6.107395766016988e-08, + "loss": 0.5613, + "mean_token_accuracy": 0.8834062963724136, + "num_tokens": 438107441.0, + "step": 4097 + }, + { + "epoch": 9.335233751425314, + "grad_norm": 2.5625, + "learning_rate": 6.066088068935577e-08, + "loss": 0.5703, + "mean_token_accuracy": 0.8836114853620529, + "num_tokens": 438214716.0, + "step": 4098 + }, + { + "epoch": 9.33751425313569, + "grad_norm": 2.671875, + "learning_rate": 6.024918824033221e-08, + "loss": 0.5945, + "mean_token_accuracy": 0.878153920173645, + "num_tokens": 438320941.0, + "step": 4099 + }, + { + "epoch": 9.339794754846066, + "grad_norm": 2.921875, + "learning_rate": 5.983888054676867e-08, + "loss": 0.5657, + "mean_token_accuracy": 0.8838547617197037, + "num_tokens": 438428175.0, + "step": 4100 + }, + { + "epoch": 9.342075256556443, + "grad_norm": 3.296875, + "learning_rate": 5.9429957841546926e-08, + "loss": 0.5726, + "mean_token_accuracy": 0.8802156448364258, + "num_tokens": 438534915.0, + "step": 4101 + }, + { + "epoch": 9.344355758266818, + "grad_norm": 5.0625, + "learning_rate": 5.902242035676409e-08, + "loss": 0.563, + "mean_token_accuracy": 0.8827090561389923, + "num_tokens": 438641893.0, + "step": 4102 + }, + { + "epoch": 9.346636259977195, + "grad_norm": 2.984375, + "learning_rate": 5.8616268323730685e-08, + "loss": 0.5755, + "mean_token_accuracy": 0.8797716945409775, + "num_tokens": 438749068.0, + "step": 4103 + }, + { + "epoch": 9.34891676168757, + "grad_norm": 5.15625, + "learning_rate": 5.821150197297038e-08, + "loss": 0.5804, + "mean_token_accuracy": 0.878428652882576, + "num_tokens": 438855850.0, + "step": 4104 + }, + { + "epoch": 9.351197263397948, + "grad_norm": 2.859375, + "learning_rate": 5.780812153422161e-08, + "loss": 0.5663, + "mean_token_accuracy": 0.8853023201227188, + "num_tokens": 438962709.0, + "step": 4105 + }, + { + "epoch": 9.353477765108323, + "grad_norm": 4.53125, + "learning_rate": 5.7406127236434016e-08, + "loss": 0.5649, + "mean_token_accuracy": 0.8817013502120972, + "num_tokens": 439069382.0, + "step": 4106 + }, + { + "epoch": 9.3557582668187, + "grad_norm": 4.15625, + "learning_rate": 5.700551930777287e-08, + "loss": 0.5525, + "mean_token_accuracy": 0.8876905292272568, + "num_tokens": 439176711.0, + "step": 4107 + }, + { + "epoch": 9.358038768529076, + "grad_norm": 2.765625, + "learning_rate": 5.66062979756149e-08, + "loss": 0.568, + "mean_token_accuracy": 0.8845077753067017, + "num_tokens": 439284100.0, + "step": 4108 + }, + { + "epoch": 9.360319270239453, + "grad_norm": 3.28125, + "learning_rate": 5.620846346655079e-08, + "loss": 0.5581, + "mean_token_accuracy": 0.8825329095125198, + "num_tokens": 439391460.0, + "step": 4109 + }, + { + "epoch": 9.36259977194983, + "grad_norm": 2.765625, + "learning_rate": 5.5812016006383805e-08, + "loss": 0.5723, + "mean_token_accuracy": 0.8812670260667801, + "num_tokens": 439498912.0, + "step": 4110 + }, + { + "epoch": 9.364880273660205, + "grad_norm": 2.828125, + "learning_rate": 5.5416955820129515e-08, + "loss": 0.5616, + "mean_token_accuracy": 0.8810815811157227, + "num_tokens": 439606318.0, + "step": 4111 + }, + { + "epoch": 9.367160775370582, + "grad_norm": 3.1875, + "learning_rate": 5.50232831320166e-08, + "loss": 0.5562, + "mean_token_accuracy": 0.8831993341445923, + "num_tokens": 439712921.0, + "step": 4112 + }, + { + "epoch": 9.369441277080957, + "grad_norm": 2.8125, + "learning_rate": 5.463099816548578e-08, + "loss": 0.5788, + "mean_token_accuracy": 0.8801111429929733, + "num_tokens": 439819707.0, + "step": 4113 + }, + { + "epoch": 9.371721778791335, + "grad_norm": 3.015625, + "learning_rate": 5.424010114319117e-08, + "loss": 0.5637, + "mean_token_accuracy": 0.8822034150362015, + "num_tokens": 439926343.0, + "step": 4114 + }, + { + "epoch": 9.37400228050171, + "grad_norm": 3.84375, + "learning_rate": 5.385059228699779e-08, + "loss": 0.5728, + "mean_token_accuracy": 0.8807101249694824, + "num_tokens": 440033050.0, + "step": 4115 + }, + { + "epoch": 9.376282782212087, + "grad_norm": 3.078125, + "learning_rate": 5.346247181798325e-08, + "loss": 0.5855, + "mean_token_accuracy": 0.8771592527627945, + "num_tokens": 440140455.0, + "step": 4116 + }, + { + "epoch": 9.378563283922462, + "grad_norm": 3.015625, + "learning_rate": 5.307573995643772e-08, + "loss": 0.5551, + "mean_token_accuracy": 0.8888624608516693, + "num_tokens": 440247555.0, + "step": 4117 + }, + { + "epoch": 9.38084378563284, + "grad_norm": 2.890625, + "learning_rate": 5.2690396921862284e-08, + "loss": 0.5652, + "mean_token_accuracy": 0.8854438215494156, + "num_tokens": 440354832.0, + "step": 4118 + }, + { + "epoch": 9.383124287343216, + "grad_norm": 2.828125, + "learning_rate": 5.230644293297088e-08, + "loss": 0.5617, + "mean_token_accuracy": 0.8868270665407181, + "num_tokens": 440462224.0, + "step": 4119 + }, + { + "epoch": 9.385404789053592, + "grad_norm": 2.953125, + "learning_rate": 5.192387820768752e-08, + "loss": 0.5614, + "mean_token_accuracy": 0.8861428201198578, + "num_tokens": 440569106.0, + "step": 4120 + }, + { + "epoch": 9.387685290763969, + "grad_norm": 3.515625, + "learning_rate": 5.154270296314878e-08, + "loss": 0.5661, + "mean_token_accuracy": 0.8851838260889053, + "num_tokens": 440676323.0, + "step": 4121 + }, + { + "epoch": 9.389965792474344, + "grad_norm": 2.796875, + "learning_rate": 5.116291741570301e-08, + "loss": 0.5632, + "mean_token_accuracy": 0.8838883936405182, + "num_tokens": 440784286.0, + "step": 4122 + }, + { + "epoch": 9.392246294184721, + "grad_norm": 2.921875, + "learning_rate": 5.078452178090831e-08, + "loss": 0.561, + "mean_token_accuracy": 0.8820521384477615, + "num_tokens": 440891324.0, + "step": 4123 + }, + { + "epoch": 9.394526795895096, + "grad_norm": 2.5625, + "learning_rate": 5.040751627353513e-08, + "loss": 0.5779, + "mean_token_accuracy": 0.8817588239908218, + "num_tokens": 440998383.0, + "step": 4124 + }, + { + "epoch": 9.396807297605474, + "grad_norm": 2.65625, + "learning_rate": 5.003190110756451e-08, + "loss": 0.5587, + "mean_token_accuracy": 0.8854601830244064, + "num_tokens": 441105621.0, + "step": 4125 + }, + { + "epoch": 9.399087799315849, + "grad_norm": 2.953125, + "learning_rate": 4.965767649618869e-08, + "loss": 0.5863, + "mean_token_accuracy": 0.8789174407720566, + "num_tokens": 441212053.0, + "step": 4126 + }, + { + "epoch": 9.401368301026226, + "grad_norm": 2.625, + "learning_rate": 4.928484265180972e-08, + "loss": 0.5602, + "mean_token_accuracy": 0.8848675191402435, + "num_tokens": 441318500.0, + "step": 4127 + }, + { + "epoch": 9.403648802736601, + "grad_norm": 2.8125, + "learning_rate": 4.8913399786041097e-08, + "loss": 0.591, + "mean_token_accuracy": 0.8787374198436737, + "num_tokens": 441425061.0, + "step": 4128 + }, + { + "epoch": 9.405929304446978, + "grad_norm": 2.875, + "learning_rate": 4.854334810970668e-08, + "loss": 0.5563, + "mean_token_accuracy": 0.8844193369150162, + "num_tokens": 441531921.0, + "step": 4129 + }, + { + "epoch": 9.408209806157355, + "grad_norm": 3.171875, + "learning_rate": 4.817468783284096e-08, + "loss": 0.5782, + "mean_token_accuracy": 0.8798136711120605, + "num_tokens": 441639273.0, + "step": 4130 + }, + { + "epoch": 9.41049030786773, + "grad_norm": 2.625, + "learning_rate": 4.7807419164687673e-08, + "loss": 0.5543, + "mean_token_accuracy": 0.8829266577959061, + "num_tokens": 441745920.0, + "step": 4131 + }, + { + "epoch": 9.412770809578108, + "grad_norm": 6.03125, + "learning_rate": 4.7441542313702293e-08, + "loss": 0.5991, + "mean_token_accuracy": 0.8739556968212128, + "num_tokens": 441852952.0, + "step": 4132 + }, + { + "epoch": 9.415051311288483, + "grad_norm": 3.09375, + "learning_rate": 4.707705748754898e-08, + "loss": 0.5619, + "mean_token_accuracy": 0.8853324800729752, + "num_tokens": 441960257.0, + "step": 4133 + }, + { + "epoch": 9.41733181299886, + "grad_norm": 3.734375, + "learning_rate": 4.671396489310198e-08, + "loss": 0.5689, + "mean_token_accuracy": 0.8818874061107635, + "num_tokens": 442067515.0, + "step": 4134 + }, + { + "epoch": 9.419612314709235, + "grad_norm": 4.375, + "learning_rate": 4.635226473644616e-08, + "loss": 0.5711, + "mean_token_accuracy": 0.8827540129423141, + "num_tokens": 442174146.0, + "step": 4135 + }, + { + "epoch": 9.421892816419613, + "grad_norm": 3.046875, + "learning_rate": 4.599195722287536e-08, + "loss": 0.5512, + "mean_token_accuracy": 0.8860030323266983, + "num_tokens": 442281613.0, + "step": 4136 + }, + { + "epoch": 9.424173318129988, + "grad_norm": 2.6875, + "learning_rate": 4.5633042556893493e-08, + "loss": 0.5853, + "mean_token_accuracy": 0.8759856522083282, + "num_tokens": 442388874.0, + "step": 4137 + }, + { + "epoch": 9.426453819840365, + "grad_norm": 2.765625, + "learning_rate": 4.527552094221288e-08, + "loss": 0.5799, + "mean_token_accuracy": 0.8808791786432266, + "num_tokens": 442495992.0, + "step": 4138 + }, + { + "epoch": 9.428734321550742, + "grad_norm": 2.953125, + "learning_rate": 4.4919392581756204e-08, + "loss": 0.5786, + "mean_token_accuracy": 0.8829675912857056, + "num_tokens": 442603136.0, + "step": 4139 + }, + { + "epoch": 9.431014823261117, + "grad_norm": 3.4375, + "learning_rate": 4.456465767765539e-08, + "loss": 0.5803, + "mean_token_accuracy": 0.8808294236660004, + "num_tokens": 442710669.0, + "step": 4140 + }, + { + "epoch": 9.433295324971494, + "grad_norm": 3.265625, + "learning_rate": 4.421131643125104e-08, + "loss": 0.5673, + "mean_token_accuracy": 0.8814819753170013, + "num_tokens": 442817853.0, + "step": 4141 + }, + { + "epoch": 9.43557582668187, + "grad_norm": 2.53125, + "learning_rate": 4.3859369043092183e-08, + "loss": 0.558, + "mean_token_accuracy": 0.8864561766386032, + "num_tokens": 442925142.0, + "step": 4142 + }, + { + "epoch": 9.437856328392247, + "grad_norm": 3.34375, + "learning_rate": 4.350881571293819e-08, + "loss": 0.5963, + "mean_token_accuracy": 0.8762660771608353, + "num_tokens": 443032724.0, + "step": 4143 + }, + { + "epoch": 9.440136830102622, + "grad_norm": 2.984375, + "learning_rate": 4.315965663975602e-08, + "loss": 0.5754, + "mean_token_accuracy": 0.8806591182947159, + "num_tokens": 443140265.0, + "step": 4144 + }, + { + "epoch": 9.442417331813, + "grad_norm": 3.296875, + "learning_rate": 4.281189202172131e-08, + "loss": 0.5769, + "mean_token_accuracy": 0.8811807930469513, + "num_tokens": 443247055.0, + "step": 4145 + }, + { + "epoch": 9.444697833523374, + "grad_norm": 2.734375, + "learning_rate": 4.246552205621896e-08, + "loss": 0.5625, + "mean_token_accuracy": 0.8876753896474838, + "num_tokens": 443353977.0, + "step": 4146 + }, + { + "epoch": 9.446978335233752, + "grad_norm": 2.703125, + "learning_rate": 4.212054693984169e-08, + "loss": 0.5843, + "mean_token_accuracy": 0.8812119662761688, + "num_tokens": 443460573.0, + "step": 4147 + }, + { + "epoch": 9.449258836944129, + "grad_norm": 2.859375, + "learning_rate": 4.177696686839094e-08, + "loss": 0.593, + "mean_token_accuracy": 0.8792661875486374, + "num_tokens": 443567499.0, + "step": 4148 + }, + { + "epoch": 9.451539338654504, + "grad_norm": 2.96875, + "learning_rate": 4.143478203687573e-08, + "loss": 0.5428, + "mean_token_accuracy": 0.8875003904104233, + "num_tokens": 443674500.0, + "step": 4149 + }, + { + "epoch": 9.453819840364881, + "grad_norm": 2.46875, + "learning_rate": 4.1093992639514026e-08, + "loss": 0.5456, + "mean_token_accuracy": 0.8857799172401428, + "num_tokens": 443781405.0, + "step": 4150 + }, + { + "epoch": 9.456100342075256, + "grad_norm": 2.71875, + "learning_rate": 4.0754598869730824e-08, + "loss": 0.5592, + "mean_token_accuracy": 0.8854545056819916, + "num_tokens": 443888679.0, + "step": 4151 + }, + { + "epoch": 9.458380843785633, + "grad_norm": 3.15625, + "learning_rate": 4.041660092015981e-08, + "loss": 0.5611, + "mean_token_accuracy": 0.8850967884063721, + "num_tokens": 443996366.0, + "step": 4152 + }, + { + "epoch": 9.460661345496009, + "grad_norm": 3.234375, + "learning_rate": 4.007999898264225e-08, + "loss": 0.596, + "mean_token_accuracy": 0.8776397556066513, + "num_tokens": 444104029.0, + "step": 4153 + }, + { + "epoch": 9.462941847206386, + "grad_norm": 3.203125, + "learning_rate": 3.9744793248226446e-08, + "loss": 0.5659, + "mean_token_accuracy": 0.8829027712345123, + "num_tokens": 444211032.0, + "step": 4154 + }, + { + "epoch": 9.465222348916761, + "grad_norm": 3.84375, + "learning_rate": 3.9410983907169076e-08, + "loss": 0.5758, + "mean_token_accuracy": 0.8814502954483032, + "num_tokens": 444317977.0, + "step": 4155 + }, + { + "epoch": 9.467502850627138, + "grad_norm": 4.15625, + "learning_rate": 3.90785711489336e-08, + "loss": 0.594, + "mean_token_accuracy": 0.8752471357584, + "num_tokens": 444424810.0, + "step": 4156 + }, + { + "epoch": 9.469783352337513, + "grad_norm": 2.734375, + "learning_rate": 3.874755516219103e-08, + "loss": 0.5697, + "mean_token_accuracy": 0.8843945115804672, + "num_tokens": 444532283.0, + "step": 4157 + }, + { + "epoch": 9.47206385404789, + "grad_norm": 2.828125, + "learning_rate": 3.8417936134820255e-08, + "loss": 0.5483, + "mean_token_accuracy": 0.8853261023759842, + "num_tokens": 444639261.0, + "step": 4158 + }, + { + "epoch": 9.474344355758268, + "grad_norm": 2.796875, + "learning_rate": 3.808971425390606e-08, + "loss": 0.5861, + "mean_token_accuracy": 0.8793308734893799, + "num_tokens": 444745716.0, + "step": 4159 + }, + { + "epoch": 9.476624857468643, + "grad_norm": 4.1875, + "learning_rate": 3.7762889705740824e-08, + "loss": 0.5629, + "mean_token_accuracy": 0.8815562427043915, + "num_tokens": 444852792.0, + "step": 4160 + }, + { + "epoch": 9.47890535917902, + "grad_norm": 2.828125, + "learning_rate": 3.743746267582421e-08, + "loss": 0.5619, + "mean_token_accuracy": 0.8823586702346802, + "num_tokens": 444960631.0, + "step": 4161 + }, + { + "epoch": 9.481185860889395, + "grad_norm": 3.65625, + "learning_rate": 3.711343334886236e-08, + "loss": 0.5929, + "mean_token_accuracy": 0.8763528019189835, + "num_tokens": 445067005.0, + "step": 4162 + }, + { + "epoch": 9.483466362599772, + "grad_norm": 5.28125, + "learning_rate": 3.679080190876788e-08, + "loss": 0.558, + "mean_token_accuracy": 0.8835884630680084, + "num_tokens": 445174337.0, + "step": 4163 + }, + { + "epoch": 9.485746864310148, + "grad_norm": 2.875, + "learning_rate": 3.646956853865985e-08, + "loss": 0.5779, + "mean_token_accuracy": 0.8791737705469131, + "num_tokens": 445281162.0, + "step": 4164 + }, + { + "epoch": 9.488027366020525, + "grad_norm": 2.953125, + "learning_rate": 3.614973342086464e-08, + "loss": 0.5696, + "mean_token_accuracy": 0.8816744834184647, + "num_tokens": 445388038.0, + "step": 4165 + }, + { + "epoch": 9.4903078677309, + "grad_norm": 3.75, + "learning_rate": 3.583129673691427e-08, + "loss": 0.5796, + "mean_token_accuracy": 0.8827396035194397, + "num_tokens": 445494974.0, + "step": 4166 + }, + { + "epoch": 9.492588369441277, + "grad_norm": 3.96875, + "learning_rate": 3.551425866754693e-08, + "loss": 0.5716, + "mean_token_accuracy": 0.8809922337532043, + "num_tokens": 445602110.0, + "step": 4167 + }, + { + "epoch": 9.494868871151652, + "grad_norm": 3.0625, + "learning_rate": 3.519861939270786e-08, + "loss": 0.5779, + "mean_token_accuracy": 0.8822459131479263, + "num_tokens": 445709330.0, + "step": 4168 + }, + { + "epoch": 9.49714937286203, + "grad_norm": 2.578125, + "learning_rate": 3.4884379091547905e-08, + "loss": 0.59, + "mean_token_accuracy": 0.8781626224517822, + "num_tokens": 445816251.0, + "step": 4169 + }, + { + "epoch": 9.499429874572407, + "grad_norm": 3.703125, + "learning_rate": 3.457153794242302e-08, + "loss": 0.581, + "mean_token_accuracy": 0.8822884410619736, + "num_tokens": 445923058.0, + "step": 4170 + }, + { + "epoch": 9.501710376282782, + "grad_norm": 2.734375, + "learning_rate": 3.4260096122896435e-08, + "loss": 0.5643, + "mean_token_accuracy": 0.8818854689598083, + "num_tokens": 446030612.0, + "step": 4171 + }, + { + "epoch": 9.503990877993159, + "grad_norm": 2.71875, + "learning_rate": 3.3950053809736204e-08, + "loss": 0.5733, + "mean_token_accuracy": 0.8821684867143631, + "num_tokens": 446137524.0, + "step": 4172 + }, + { + "epoch": 9.506271379703534, + "grad_norm": 3.046875, + "learning_rate": 3.364141117891656e-08, + "loss": 0.5467, + "mean_token_accuracy": 0.8861701488494873, + "num_tokens": 446245159.0, + "step": 4173 + }, + { + "epoch": 9.508551881413911, + "grad_norm": 4.3125, + "learning_rate": 3.333416840561709e-08, + "loss": 0.5643, + "mean_token_accuracy": 0.8825655877590179, + "num_tokens": 446351813.0, + "step": 4174 + }, + { + "epoch": 9.510832383124287, + "grad_norm": 2.890625, + "learning_rate": 3.302832566422276e-08, + "loss": 0.5637, + "mean_token_accuracy": 0.8839205503463745, + "num_tokens": 446459491.0, + "step": 4175 + }, + { + "epoch": 9.513112884834664, + "grad_norm": 3.109375, + "learning_rate": 3.272388312832414e-08, + "loss": 0.5609, + "mean_token_accuracy": 0.88397616147995, + "num_tokens": 446566325.0, + "step": 4176 + }, + { + "epoch": 9.515393386545039, + "grad_norm": 2.8125, + "learning_rate": 3.242084097071663e-08, + "loss": 0.5719, + "mean_token_accuracy": 0.8808989524841309, + "num_tokens": 446672682.0, + "step": 4177 + }, + { + "epoch": 9.517673888255416, + "grad_norm": 2.578125, + "learning_rate": 3.211919936340152e-08, + "loss": 0.5851, + "mean_token_accuracy": 0.8773170709609985, + "num_tokens": 446779847.0, + "step": 4178 + }, + { + "epoch": 9.519954389965793, + "grad_norm": 3.4375, + "learning_rate": 3.1818958477584375e-08, + "loss": 0.562, + "mean_token_accuracy": 0.8823383450508118, + "num_tokens": 446886733.0, + "step": 4179 + }, + { + "epoch": 9.522234891676169, + "grad_norm": 2.78125, + "learning_rate": 3.152011848367664e-08, + "loss": 0.5527, + "mean_token_accuracy": 0.8837304264307022, + "num_tokens": 446994279.0, + "step": 4180 + }, + { + "epoch": 9.522234891676169, + "eval_loss": 0.5863840579986572, + "eval_mean_token_accuracy": 0.8800425250720615, + "eval_num_tokens": 446994279.0, + "eval_runtime": 58.647, + "eval_samples_per_second": 142.974, + "eval_steps_per_second": 4.484, + "step": 4180 + }, + { + "epoch": 9.524515393386546, + "grad_norm": 3.328125, + "learning_rate": 3.1222679551293486e-08, + "loss": 0.5806, + "mean_token_accuracy": 0.878794476389885, + "num_tokens": 447100998.0, + "step": 4181 + }, + { + "epoch": 9.526795895096921, + "grad_norm": 3.6875, + "learning_rate": 3.0926641849255976e-08, + "loss": 0.5735, + "mean_token_accuracy": 0.8829492926597595, + "num_tokens": 447208281.0, + "step": 4182 + }, + { + "epoch": 9.529076396807298, + "grad_norm": 3.09375, + "learning_rate": 3.063200554558915e-08, + "loss": 0.5751, + "mean_token_accuracy": 0.8793532252311707, + "num_tokens": 447315119.0, + "step": 4183 + }, + { + "epoch": 9.531356898517673, + "grad_norm": 2.453125, + "learning_rate": 3.033877080752312e-08, + "loss": 0.5559, + "mean_token_accuracy": 0.8861254155635834, + "num_tokens": 447422365.0, + "step": 4184 + }, + { + "epoch": 9.53363740022805, + "grad_norm": 2.875, + "learning_rate": 3.0046937801491983e-08, + "loss": 0.5595, + "mean_token_accuracy": 0.8834296315908432, + "num_tokens": 447529666.0, + "step": 4185 + }, + { + "epoch": 9.535917901938426, + "grad_norm": 2.921875, + "learning_rate": 2.97565066931349e-08, + "loss": 0.5897, + "mean_token_accuracy": 0.8798715174198151, + "num_tokens": 447636941.0, + "step": 4186 + }, + { + "epoch": 9.538198403648803, + "grad_norm": 3.984375, + "learning_rate": 2.9467477647294464e-08, + "loss": 0.5894, + "mean_token_accuracy": 0.8795375227928162, + "num_tokens": 447744498.0, + "step": 4187 + }, + { + "epoch": 9.54047890535918, + "grad_norm": 3.9375, + "learning_rate": 2.917985082801833e-08, + "loss": 0.5647, + "mean_token_accuracy": 0.8826041370630264, + "num_tokens": 447852156.0, + "step": 4188 + }, + { + "epoch": 9.542759407069555, + "grad_norm": 2.9375, + "learning_rate": 2.8893626398557583e-08, + "loss": 0.5683, + "mean_token_accuracy": 0.8823677599430084, + "num_tokens": 447958991.0, + "step": 4189 + }, + { + "epoch": 9.545039908779932, + "grad_norm": 3.71875, + "learning_rate": 2.8608804521368382e-08, + "loss": 0.5586, + "mean_token_accuracy": 0.8871615529060364, + "num_tokens": 448066219.0, + "step": 4190 + }, + { + "epoch": 9.547320410490308, + "grad_norm": 3.640625, + "learning_rate": 2.832538535810947e-08, + "loss": 0.5837, + "mean_token_accuracy": 0.8768136501312256, + "num_tokens": 448173466.0, + "step": 4191 + }, + { + "epoch": 9.549600912200685, + "grad_norm": 3.1875, + "learning_rate": 2.804336906964439e-08, + "loss": 0.5775, + "mean_token_accuracy": 0.8800145536661148, + "num_tokens": 448280207.0, + "step": 4192 + }, + { + "epoch": 9.55188141391106, + "grad_norm": 4.375, + "learning_rate": 2.7762755816039823e-08, + "loss": 0.5778, + "mean_token_accuracy": 0.8806869983673096, + "num_tokens": 448387602.0, + "step": 4193 + }, + { + "epoch": 9.554161915621437, + "grad_norm": 3.578125, + "learning_rate": 2.74835457565667e-08, + "loss": 0.5452, + "mean_token_accuracy": 0.8861921578645706, + "num_tokens": 448495042.0, + "step": 4194 + }, + { + "epoch": 9.556442417331812, + "grad_norm": 2.796875, + "learning_rate": 2.7205739049699365e-08, + "loss": 0.5467, + "mean_token_accuracy": 0.8862817734479904, + "num_tokens": 448603055.0, + "step": 4195 + }, + { + "epoch": 9.55872291904219, + "grad_norm": 3.75, + "learning_rate": 2.6929335853115302e-08, + "loss": 0.5855, + "mean_token_accuracy": 0.878138080239296, + "num_tokens": 448710105.0, + "step": 4196 + }, + { + "epoch": 9.561003420752566, + "grad_norm": 2.640625, + "learning_rate": 2.6654336323695963e-08, + "loss": 0.5668, + "mean_token_accuracy": 0.8829863220453262, + "num_tokens": 448817588.0, + "step": 4197 + }, + { + "epoch": 9.563283922462942, + "grad_norm": 2.828125, + "learning_rate": 2.63807406175251e-08, + "loss": 0.5701, + "mean_token_accuracy": 0.8825019598007202, + "num_tokens": 448924426.0, + "step": 4198 + }, + { + "epoch": 9.565564424173319, + "grad_norm": 3.1875, + "learning_rate": 2.6108548889891005e-08, + "loss": 0.5751, + "mean_token_accuracy": 0.8802784979343414, + "num_tokens": 449032265.0, + "step": 4199 + }, + { + "epoch": 9.567844925883694, + "grad_norm": 3.03125, + "learning_rate": 2.5837761295284258e-08, + "loss": 0.5914, + "mean_token_accuracy": 0.878703162074089, + "num_tokens": 449139772.0, + "step": 4200 + }, + { + "epoch": 9.570125427594071, + "grad_norm": 3.515625, + "learning_rate": 2.5568377987398862e-08, + "loss": 0.5538, + "mean_token_accuracy": 0.8867183774709702, + "num_tokens": 449247187.0, + "step": 4201 + }, + { + "epoch": 9.572405929304447, + "grad_norm": 2.578125, + "learning_rate": 2.5300399119131124e-08, + "loss": 0.5641, + "mean_token_accuracy": 0.8858496695756912, + "num_tokens": 449354632.0, + "step": 4202 + }, + { + "epoch": 9.574686431014824, + "grad_norm": 3.375, + "learning_rate": 2.5033824842581046e-08, + "loss": 0.5552, + "mean_token_accuracy": 0.8847160637378693, + "num_tokens": 449461642.0, + "step": 4203 + }, + { + "epoch": 9.576966932725199, + "grad_norm": 3.328125, + "learning_rate": 2.476865530905065e-08, + "loss": 0.5726, + "mean_token_accuracy": 0.8837704807519913, + "num_tokens": 449567936.0, + "step": 4204 + }, + { + "epoch": 9.579247434435576, + "grad_norm": 2.65625, + "learning_rate": 2.4504890669045654e-08, + "loss": 0.5684, + "mean_token_accuracy": 0.8814296871423721, + "num_tokens": 449674620.0, + "step": 4205 + }, + { + "epoch": 9.581527936145951, + "grad_norm": 3.0, + "learning_rate": 2.4242531072273255e-08, + "loss": 0.5677, + "mean_token_accuracy": 0.8843848407268524, + "num_tokens": 449782103.0, + "step": 4206 + }, + { + "epoch": 9.583808437856328, + "grad_norm": 2.484375, + "learning_rate": 2.398157666764378e-08, + "loss": 0.5678, + "mean_token_accuracy": 0.883353129029274, + "num_tokens": 449888924.0, + "step": 4207 + }, + { + "epoch": 9.586088939566705, + "grad_norm": 2.875, + "learning_rate": 2.3722027603270415e-08, + "loss": 0.5702, + "mean_token_accuracy": 0.8861001282930374, + "num_tokens": 449996145.0, + "step": 4208 + }, + { + "epoch": 9.58836944127708, + "grad_norm": 3.109375, + "learning_rate": 2.3463884026467265e-08, + "loss": 0.585, + "mean_token_accuracy": 0.8813659995794296, + "num_tokens": 450103843.0, + "step": 4209 + }, + { + "epoch": 9.590649942987458, + "grad_norm": 2.703125, + "learning_rate": 2.320714608375241e-08, + "loss": 0.5616, + "mean_token_accuracy": 0.8853833377361298, + "num_tokens": 450210552.0, + "step": 4210 + }, + { + "epoch": 9.592930444697833, + "grad_norm": 4.96875, + "learning_rate": 2.295181392084511e-08, + "loss": 0.5994, + "mean_token_accuracy": 0.877122089266777, + "num_tokens": 450318286.0, + "step": 4211 + }, + { + "epoch": 9.59521094640821, + "grad_norm": 3.765625, + "learning_rate": 2.269788768266695e-08, + "loss": 0.5508, + "mean_token_accuracy": 0.8865731358528137, + "num_tokens": 450425852.0, + "step": 4212 + }, + { + "epoch": 9.597491448118586, + "grad_norm": 3.0, + "learning_rate": 2.2445367513341533e-08, + "loss": 0.5923, + "mean_token_accuracy": 0.8766883164644241, + "num_tokens": 450532753.0, + "step": 4213 + }, + { + "epoch": 9.599771949828963, + "grad_norm": 2.953125, + "learning_rate": 2.21942535561942e-08, + "loss": 0.5868, + "mean_token_accuracy": 0.8800808787345886, + "num_tokens": 450639888.0, + "step": 4214 + }, + { + "epoch": 9.602052451539338, + "grad_norm": 3.765625, + "learning_rate": 2.1944545953752894e-08, + "loss": 0.5824, + "mean_token_accuracy": 0.8837506324052811, + "num_tokens": 450746949.0, + "step": 4215 + }, + { + "epoch": 9.604332953249715, + "grad_norm": 3.265625, + "learning_rate": 2.1696244847746737e-08, + "loss": 0.574, + "mean_token_accuracy": 0.8841430097818375, + "num_tokens": 450853966.0, + "step": 4216 + }, + { + "epoch": 9.60661345496009, + "grad_norm": 2.734375, + "learning_rate": 2.1449350379106336e-08, + "loss": 0.5536, + "mean_token_accuracy": 0.8856358528137207, + "num_tokens": 450961525.0, + "step": 4217 + }, + { + "epoch": 9.608893956670467, + "grad_norm": 3.140625, + "learning_rate": 2.1203862687964595e-08, + "loss": 0.5718, + "mean_token_accuracy": 0.8838754594326019, + "num_tokens": 451068191.0, + "step": 4218 + }, + { + "epoch": 9.611174458380844, + "grad_norm": 4.28125, + "learning_rate": 2.0959781913655053e-08, + "loss": 0.5791, + "mean_token_accuracy": 0.8790152668952942, + "num_tokens": 451175232.0, + "step": 4219 + }, + { + "epoch": 9.61345496009122, + "grad_norm": 3.078125, + "learning_rate": 2.0717108194713566e-08, + "loss": 0.5641, + "mean_token_accuracy": 0.8844882100820541, + "num_tokens": 451282602.0, + "step": 4220 + }, + { + "epoch": 9.615735461801597, + "grad_norm": 2.953125, + "learning_rate": 2.0475841668877172e-08, + "loss": 0.5797, + "mean_token_accuracy": 0.8819600045681, + "num_tokens": 451389683.0, + "step": 4221 + }, + { + "epoch": 9.618015963511972, + "grad_norm": 3.078125, + "learning_rate": 2.0235982473084115e-08, + "loss": 0.5573, + "mean_token_accuracy": 0.8830082267522812, + "num_tokens": 451496555.0, + "step": 4222 + }, + { + "epoch": 9.62029646522235, + "grad_norm": 3.140625, + "learning_rate": 1.9997530743473548e-08, + "loss": 0.5728, + "mean_token_accuracy": 0.8831875026226044, + "num_tokens": 451603138.0, + "step": 4223 + }, + { + "epoch": 9.622576966932725, + "grad_norm": 3.28125, + "learning_rate": 1.9760486615386376e-08, + "loss": 0.5686, + "mean_token_accuracy": 0.8828965127468109, + "num_tokens": 451709899.0, + "step": 4224 + }, + { + "epoch": 9.624857468643102, + "grad_norm": 3.078125, + "learning_rate": 1.9524850223363868e-08, + "loss": 0.5722, + "mean_token_accuracy": 0.8803802877664566, + "num_tokens": 451817093.0, + "step": 4225 + }, + { + "epoch": 9.627137970353477, + "grad_norm": 2.90625, + "learning_rate": 1.9290621701149315e-08, + "loss": 0.5728, + "mean_token_accuracy": 0.883967936038971, + "num_tokens": 451924424.0, + "step": 4226 + }, + { + "epoch": 9.629418472063854, + "grad_norm": 3.9375, + "learning_rate": 1.905780118168582e-08, + "loss": 0.5787, + "mean_token_accuracy": 0.8782493621110916, + "num_tokens": 452031788.0, + "step": 4227 + }, + { + "epoch": 9.631698973774231, + "grad_norm": 3.625, + "learning_rate": 1.882638879711768e-08, + "loss": 0.5915, + "mean_token_accuracy": 0.8770202100276947, + "num_tokens": 452139174.0, + "step": 4228 + }, + { + "epoch": 9.633979475484606, + "grad_norm": 2.640625, + "learning_rate": 1.859638467879038e-08, + "loss": 0.5595, + "mean_token_accuracy": 0.8836124539375305, + "num_tokens": 452246865.0, + "step": 4229 + }, + { + "epoch": 9.636259977194984, + "grad_norm": 2.859375, + "learning_rate": 1.8367788957250054e-08, + "loss": 0.56, + "mean_token_accuracy": 0.8824215233325958, + "num_tokens": 452354042.0, + "step": 4230 + }, + { + "epoch": 9.638540478905359, + "grad_norm": 2.765625, + "learning_rate": 1.8140601762242916e-08, + "loss": 0.5793, + "mean_token_accuracy": 0.8815779983997345, + "num_tokens": 452461206.0, + "step": 4231 + }, + { + "epoch": 9.640820980615736, + "grad_norm": 3.625, + "learning_rate": 1.7914823222715817e-08, + "loss": 0.5824, + "mean_token_accuracy": 0.8769263476133347, + "num_tokens": 452567894.0, + "step": 4232 + }, + { + "epoch": 9.643101482326111, + "grad_norm": 2.890625, + "learning_rate": 1.7690453466816805e-08, + "loss": 0.5695, + "mean_token_accuracy": 0.8843720555305481, + "num_tokens": 452675400.0, + "step": 4233 + }, + { + "epoch": 9.645381984036488, + "grad_norm": 3.0625, + "learning_rate": 1.7467492621893457e-08, + "loss": 0.5697, + "mean_token_accuracy": 0.8844276815652847, + "num_tokens": 452782870.0, + "step": 4234 + }, + { + "epoch": 9.647662485746864, + "grad_norm": 3.078125, + "learning_rate": 1.724594081449399e-08, + "loss": 0.5705, + "mean_token_accuracy": 0.8851383030414581, + "num_tokens": 452889771.0, + "step": 4235 + }, + { + "epoch": 9.64994298745724, + "grad_norm": 3.8125, + "learning_rate": 1.702579817036726e-08, + "loss": 0.5686, + "mean_token_accuracy": 0.8865519165992737, + "num_tokens": 452997568.0, + "step": 4236 + }, + { + "epoch": 9.652223489167618, + "grad_norm": 2.96875, + "learning_rate": 1.680706481446165e-08, + "loss": 0.5671, + "mean_token_accuracy": 0.8826638162136078, + "num_tokens": 453104638.0, + "step": 4237 + }, + { + "epoch": 9.654503990877993, + "grad_norm": 3.09375, + "learning_rate": 1.6589740870926186e-08, + "loss": 0.5501, + "mean_token_accuracy": 0.8905289322137833, + "num_tokens": 453211884.0, + "step": 4238 + }, + { + "epoch": 9.65678449258837, + "grad_norm": 2.640625, + "learning_rate": 1.6373826463109976e-08, + "loss": 0.5844, + "mean_token_accuracy": 0.8786414712667465, + "num_tokens": 453318930.0, + "step": 4239 + }, + { + "epoch": 9.659064994298745, + "grad_norm": 2.59375, + "learning_rate": 1.6159321713561382e-08, + "loss": 0.5901, + "mean_token_accuracy": 0.8797651678323746, + "num_tokens": 453426505.0, + "step": 4240 + }, + { + "epoch": 9.661345496009123, + "grad_norm": 3.203125, + "learning_rate": 1.5946226744029402e-08, + "loss": 0.5725, + "mean_token_accuracy": 0.8815406709909439, + "num_tokens": 453533705.0, + "step": 4241 + }, + { + "epoch": 9.663625997719498, + "grad_norm": 4.21875, + "learning_rate": 1.5734541675462567e-08, + "loss": 0.556, + "mean_token_accuracy": 0.8848689198493958, + "num_tokens": 453640428.0, + "step": 4242 + }, + { + "epoch": 9.665906499429875, + "grad_norm": 4.21875, + "learning_rate": 1.5524266628009212e-08, + "loss": 0.5868, + "mean_token_accuracy": 0.8785993754863739, + "num_tokens": 453747745.0, + "step": 4243 + }, + { + "epoch": 9.66818700114025, + "grad_norm": 3.140625, + "learning_rate": 1.5315401721017752e-08, + "loss": 0.5512, + "mean_token_accuracy": 0.8862568438053131, + "num_tokens": 453854721.0, + "step": 4244 + }, + { + "epoch": 9.670467502850627, + "grad_norm": 2.65625, + "learning_rate": 1.5107947073035312e-08, + "loss": 0.575, + "mean_token_accuracy": 0.8809828609228134, + "num_tokens": 453962100.0, + "step": 4245 + }, + { + "epoch": 9.672748004561003, + "grad_norm": 4.5, + "learning_rate": 1.4901902801809642e-08, + "loss": 0.5666, + "mean_token_accuracy": 0.8845654428005219, + "num_tokens": 454069089.0, + "step": 4246 + }, + { + "epoch": 9.67502850627138, + "grad_norm": 2.671875, + "learning_rate": 1.4697269024287198e-08, + "loss": 0.5652, + "mean_token_accuracy": 0.8847362250089645, + "num_tokens": 454176352.0, + "step": 4247 + }, + { + "epoch": 9.677309007981757, + "grad_norm": 3.9375, + "learning_rate": 1.4494045856613959e-08, + "loss": 0.5736, + "mean_token_accuracy": 0.8813314586877823, + "num_tokens": 454282952.0, + "step": 4248 + }, + { + "epoch": 9.679589509692132, + "grad_norm": 3.078125, + "learning_rate": 1.4292233414135992e-08, + "loss": 0.5811, + "mean_token_accuracy": 0.8810845017433167, + "num_tokens": 454390017.0, + "step": 4249 + }, + { + "epoch": 9.68187001140251, + "grad_norm": 3.765625, + "learning_rate": 1.4091831811397782e-08, + "loss": 0.5771, + "mean_token_accuracy": 0.8813609778881073, + "num_tokens": 454496557.0, + "step": 4250 + }, + { + "epoch": 9.684150513112884, + "grad_norm": 3.359375, + "learning_rate": 1.38928411621439e-08, + "loss": 0.576, + "mean_token_accuracy": 0.8821087181568146, + "num_tokens": 454603755.0, + "step": 4251 + }, + { + "epoch": 9.686431014823262, + "grad_norm": 5.5, + "learning_rate": 1.3695261579316776e-08, + "loss": 0.5867, + "mean_token_accuracy": 0.8810128718614578, + "num_tokens": 454710416.0, + "step": 4252 + }, + { + "epoch": 9.688711516533637, + "grad_norm": 2.546875, + "learning_rate": 1.3499093175059208e-08, + "loss": 0.5492, + "mean_token_accuracy": 0.8873603790998459, + "num_tokens": 454817321.0, + "step": 4253 + }, + { + "epoch": 9.690992018244014, + "grad_norm": 2.625, + "learning_rate": 1.3304336060712685e-08, + "loss": 0.5604, + "mean_token_accuracy": 0.8854569047689438, + "num_tokens": 454924489.0, + "step": 4254 + }, + { + "epoch": 9.69327251995439, + "grad_norm": 3.09375, + "learning_rate": 1.3110990346817676e-08, + "loss": 0.5641, + "mean_token_accuracy": 0.8837967067956924, + "num_tokens": 455031171.0, + "step": 4255 + }, + { + "epoch": 9.695553021664766, + "grad_norm": 2.84375, + "learning_rate": 1.2919056143113062e-08, + "loss": 0.5644, + "mean_token_accuracy": 0.8802383691072464, + "num_tokens": 455138285.0, + "step": 4256 + }, + { + "epoch": 9.697833523375142, + "grad_norm": 2.625, + "learning_rate": 1.2728533558537259e-08, + "loss": 0.5509, + "mean_token_accuracy": 0.8880133926868439, + "num_tokens": 455245413.0, + "step": 4257 + }, + { + "epoch": 9.700114025085519, + "grad_norm": 3.390625, + "learning_rate": 1.2539422701227099e-08, + "loss": 0.5777, + "mean_token_accuracy": 0.8805587142705917, + "num_tokens": 455352632.0, + "step": 4258 + }, + { + "epoch": 9.702394526795896, + "grad_norm": 2.921875, + "learning_rate": 1.235172367851839e-08, + "loss": 0.5548, + "mean_token_accuracy": 0.8849736303091049, + "num_tokens": 455460206.0, + "step": 4259 + }, + { + "epoch": 9.704675028506271, + "grad_norm": 5.21875, + "learning_rate": 1.2165436596945634e-08, + "loss": 0.5487, + "mean_token_accuracy": 0.8850021660327911, + "num_tokens": 455567328.0, + "step": 4260 + }, + { + "epoch": 9.706955530216648, + "grad_norm": 3.90625, + "learning_rate": 1.19805615622412e-08, + "loss": 0.5687, + "mean_token_accuracy": 0.8875507861375809, + "num_tokens": 455674739.0, + "step": 4261 + }, + { + "epoch": 9.709236031927023, + "grad_norm": 3.3125, + "learning_rate": 1.179709867933726e-08, + "loss": 0.5739, + "mean_token_accuracy": 0.8814339190721512, + "num_tokens": 455782280.0, + "step": 4262 + }, + { + "epoch": 9.7115165336374, + "grad_norm": 2.9375, + "learning_rate": 1.1615048052363298e-08, + "loss": 0.5736, + "mean_token_accuracy": 0.8808081150054932, + "num_tokens": 455888628.0, + "step": 4263 + }, + { + "epoch": 9.713797035347776, + "grad_norm": 3.25, + "learning_rate": 1.1434409784648049e-08, + "loss": 0.5585, + "mean_token_accuracy": 0.8849108666181564, + "num_tokens": 455996087.0, + "step": 4264 + }, + { + "epoch": 9.716077537058153, + "grad_norm": 2.8125, + "learning_rate": 1.125518397871811e-08, + "loss": 0.5839, + "mean_token_accuracy": 0.880156397819519, + "num_tokens": 456103096.0, + "step": 4265 + }, + { + "epoch": 9.718358038768528, + "grad_norm": 3.1875, + "learning_rate": 1.1077370736298498e-08, + "loss": 0.575, + "mean_token_accuracy": 0.8832675367593765, + "num_tokens": 456209824.0, + "step": 4266 + }, + { + "epoch": 9.720638540478905, + "grad_norm": 2.578125, + "learning_rate": 1.090097015831293e-08, + "loss": 0.5706, + "mean_token_accuracy": 0.8845285028219223, + "num_tokens": 456317318.0, + "step": 4267 + }, + { + "epoch": 9.722919042189282, + "grad_norm": 2.90625, + "learning_rate": 1.0725982344882701e-08, + "loss": 0.5922, + "mean_token_accuracy": 0.8807027041912079, + "num_tokens": 456424411.0, + "step": 4268 + }, + { + "epoch": 9.725199543899658, + "grad_norm": 3.0, + "learning_rate": 1.0552407395327813e-08, + "loss": 0.5872, + "mean_token_accuracy": 0.8796821981668472, + "num_tokens": 456531485.0, + "step": 4269 + }, + { + "epoch": 9.727480045610035, + "grad_norm": 3.84375, + "learning_rate": 1.0380245408165846e-08, + "loss": 0.5654, + "mean_token_accuracy": 0.8859843015670776, + "num_tokens": 456638619.0, + "step": 4270 + }, + { + "epoch": 9.72976054732041, + "grad_norm": 3.296875, + "learning_rate": 1.0209496481112247e-08, + "loss": 0.5732, + "mean_token_accuracy": 0.8768668621778488, + "num_tokens": 456745727.0, + "step": 4271 + }, + { + "epoch": 9.732041049030787, + "grad_norm": 3.125, + "learning_rate": 1.0040160711081437e-08, + "loss": 0.5719, + "mean_token_accuracy": 0.8817434310913086, + "num_tokens": 456852873.0, + "step": 4272 + }, + { + "epoch": 9.734321550741162, + "grad_norm": 2.609375, + "learning_rate": 9.87223819418487e-09, + "loss": 0.5612, + "mean_token_accuracy": 0.8846335262060165, + "num_tokens": 456960828.0, + "step": 4273 + }, + { + "epoch": 9.73660205245154, + "grad_norm": 4.84375, + "learning_rate": 9.705729025732135e-09, + "loss": 0.5523, + "mean_token_accuracy": 0.8860846012830734, + "num_tokens": 457067866.0, + "step": 4274 + }, + { + "epoch": 9.738882554161915, + "grad_norm": 3.90625, + "learning_rate": 9.540633300230418e-09, + "loss": 0.579, + "mean_token_accuracy": 0.8812527358531952, + "num_tokens": 457174777.0, + "step": 4275 + }, + { + "epoch": 9.741163055872292, + "grad_norm": 3.0625, + "learning_rate": 9.376951111385313e-09, + "loss": 0.5482, + "mean_token_accuracy": 0.8858974128961563, + "num_tokens": 457282370.0, + "step": 4276 + }, + { + "epoch": 9.743443557582669, + "grad_norm": 2.765625, + "learning_rate": 9.214682552099175e-09, + "loss": 0.5803, + "mean_token_accuracy": 0.8799700736999512, + "num_tokens": 457389651.0, + "step": 4277 + }, + { + "epoch": 9.745724059293044, + "grad_norm": 2.90625, + "learning_rate": 9.053827714472773e-09, + "loss": 0.5549, + "mean_token_accuracy": 0.8852715194225311, + "num_tokens": 457496887.0, + "step": 4278 + }, + { + "epoch": 9.748004561003421, + "grad_norm": 2.8125, + "learning_rate": 8.894386689804469e-09, + "loss": 0.5617, + "mean_token_accuracy": 0.883760392665863, + "num_tokens": 457603521.0, + "step": 4279 + }, + { + "epoch": 9.750285062713797, + "grad_norm": 3.078125, + "learning_rate": 8.73635956858937e-09, + "loss": 0.5729, + "mean_token_accuracy": 0.8828184455633163, + "num_tokens": 457710089.0, + "step": 4280 + }, + { + "epoch": 9.752565564424174, + "grad_norm": 2.84375, + "learning_rate": 8.579746440520731e-09, + "loss": 0.5782, + "mean_token_accuracy": 0.8836691379547119, + "num_tokens": 457817982.0, + "step": 4281 + }, + { + "epoch": 9.754846066134549, + "grad_norm": 2.65625, + "learning_rate": 8.424547394489668e-09, + "loss": 0.5624, + "mean_token_accuracy": 0.883584663271904, + "num_tokens": 457925306.0, + "step": 4282 + }, + { + "epoch": 9.757126567844926, + "grad_norm": 3.5625, + "learning_rate": 8.270762518583498e-09, + "loss": 0.5763, + "mean_token_accuracy": 0.8828044384717941, + "num_tokens": 458032364.0, + "step": 4283 + }, + { + "epoch": 9.759407069555301, + "grad_norm": 2.921875, + "learning_rate": 8.118391900087952e-09, + "loss": 0.5401, + "mean_token_accuracy": 0.8894132673740387, + "num_tokens": 458139150.0, + "step": 4284 + }, + { + "epoch": 9.761687571265679, + "grad_norm": 3.828125, + "learning_rate": 7.967435625485242e-09, + "loss": 0.5614, + "mean_token_accuracy": 0.8839370012283325, + "num_tokens": 458246236.0, + "step": 4285 + }, + { + "epoch": 9.763968072976056, + "grad_norm": 4.0625, + "learning_rate": 7.81789378045572e-09, + "loss": 0.555, + "mean_token_accuracy": 0.8877554684877396, + "num_tokens": 458353277.0, + "step": 4286 + }, + { + "epoch": 9.766248574686431, + "grad_norm": 2.796875, + "learning_rate": 7.669766449876493e-09, + "loss": 0.5718, + "mean_token_accuracy": 0.8832697570323944, + "num_tokens": 458459980.0, + "step": 4287 + }, + { + "epoch": 9.768529076396808, + "grad_norm": 3.578125, + "learning_rate": 7.523053717821138e-09, + "loss": 0.57, + "mean_token_accuracy": 0.8807340711355209, + "num_tokens": 458566519.0, + "step": 4288 + }, + { + "epoch": 9.770809578107183, + "grad_norm": 3.703125, + "learning_rate": 7.377755667561659e-09, + "loss": 0.5684, + "mean_token_accuracy": 0.8821778148412704, + "num_tokens": 458673827.0, + "step": 4289 + }, + { + "epoch": 9.77309007981756, + "grad_norm": 3.921875, + "learning_rate": 7.233872381565976e-09, + "loss": 0.5793, + "mean_token_accuracy": 0.8782579749822617, + "num_tokens": 458780746.0, + "step": 4290 + }, + { + "epoch": 9.775370581527936, + "grad_norm": 2.9375, + "learning_rate": 7.091403941499597e-09, + "loss": 0.556, + "mean_token_accuracy": 0.885596290230751, + "num_tokens": 458887534.0, + "step": 4291 + }, + { + "epoch": 9.777651083238313, + "grad_norm": 3.0, + "learning_rate": 6.950350428225061e-09, + "loss": 0.5504, + "mean_token_accuracy": 0.8886802643537521, + "num_tokens": 458994895.0, + "step": 4292 + }, + { + "epoch": 9.779931584948688, + "grad_norm": 3.09375, + "learning_rate": 6.810711921801105e-09, + "loss": 0.5644, + "mean_token_accuracy": 0.8828777819871902, + "num_tokens": 459102180.0, + "step": 4293 + }, + { + "epoch": 9.782212086659065, + "grad_norm": 3.34375, + "learning_rate": 6.672488501484608e-09, + "loss": 0.5629, + "mean_token_accuracy": 0.8822075426578522, + "num_tokens": 459209744.0, + "step": 4294 + }, + { + "epoch": 9.78449258836944, + "grad_norm": 3.765625, + "learning_rate": 6.535680245727816e-09, + "loss": 0.556, + "mean_token_accuracy": 0.8854386210441589, + "num_tokens": 459317224.0, + "step": 4295 + }, + { + "epoch": 9.786773090079818, + "grad_norm": 2.828125, + "learning_rate": 6.400287232180558e-09, + "loss": 0.5752, + "mean_token_accuracy": 0.8836027830839157, + "num_tokens": 459424913.0, + "step": 4296 + }, + { + "epoch": 9.789053591790195, + "grad_norm": 2.828125, + "learning_rate": 6.266309537689696e-09, + "loss": 0.5751, + "mean_token_accuracy": 0.8846608698368073, + "num_tokens": 459531883.0, + "step": 4297 + }, + { + "epoch": 9.79133409350057, + "grad_norm": 4.5, + "learning_rate": 6.133747238298016e-09, + "loss": 0.5916, + "mean_token_accuracy": 0.8778994530439377, + "num_tokens": 459638404.0, + "step": 4298 + }, + { + "epoch": 9.793614595210947, + "grad_norm": 2.65625, + "learning_rate": 6.002600409245607e-09, + "loss": 0.5522, + "mean_token_accuracy": 0.8856344819068909, + "num_tokens": 459746677.0, + "step": 4299 + }, + { + "epoch": 9.795895096921322, + "grad_norm": 4.03125, + "learning_rate": 5.872869124968761e-09, + "loss": 0.5939, + "mean_token_accuracy": 0.87481589615345, + "num_tokens": 459853457.0, + "step": 4300 + }, + { + "epoch": 9.7981755986317, + "grad_norm": 2.8125, + "learning_rate": 5.7445534591002435e-09, + "loss": 0.5763, + "mean_token_accuracy": 0.8805107921361923, + "num_tokens": 459960464.0, + "step": 4301 + }, + { + "epoch": 9.800456100342075, + "grad_norm": 2.71875, + "learning_rate": 5.617653484469576e-09, + "loss": 0.5489, + "mean_token_accuracy": 0.885638564825058, + "num_tokens": 460067123.0, + "step": 4302 + }, + { + "epoch": 9.802736602052452, + "grad_norm": 6.28125, + "learning_rate": 5.492169273103309e-09, + "loss": 0.5632, + "mean_token_accuracy": 0.884444460272789, + "num_tokens": 460173895.0, + "step": 4303 + }, + { + "epoch": 9.805017103762827, + "grad_norm": 3.171875, + "learning_rate": 5.368100896223083e-09, + "loss": 0.5775, + "mean_token_accuracy": 0.8823024779558182, + "num_tokens": 460280558.0, + "step": 4304 + }, + { + "epoch": 9.807297605473204, + "grad_norm": 3.640625, + "learning_rate": 5.245448424248123e-09, + "loss": 0.5707, + "mean_token_accuracy": 0.885224923491478, + "num_tokens": 460387236.0, + "step": 4305 + }, + { + "epoch": 9.80957810718358, + "grad_norm": 2.96875, + "learning_rate": 5.124211926793577e-09, + "loss": 0.5659, + "mean_token_accuracy": 0.8848960250616074, + "num_tokens": 460493654.0, + "step": 4306 + }, + { + "epoch": 9.811858608893957, + "grad_norm": 2.796875, + "learning_rate": 5.004391472670788e-09, + "loss": 0.5544, + "mean_token_accuracy": 0.8836629986763, + "num_tokens": 460600975.0, + "step": 4307 + }, + { + "epoch": 9.814139110604334, + "grad_norm": 3.53125, + "learning_rate": 4.885987129887859e-09, + "loss": 0.5956, + "mean_token_accuracy": 0.8799479305744171, + "num_tokens": 460707729.0, + "step": 4308 + }, + { + "epoch": 9.816419612314709, + "grad_norm": 2.6875, + "learning_rate": 4.768998965648253e-09, + "loss": 0.591, + "mean_token_accuracy": 0.8796339482069016, + "num_tokens": 460814870.0, + "step": 4309 + }, + { + "epoch": 9.818700114025086, + "grad_norm": 3.15625, + "learning_rate": 4.653427046352743e-09, + "loss": 0.5804, + "mean_token_accuracy": 0.8791462630033493, + "num_tokens": 460921892.0, + "step": 4310 + }, + { + "epoch": 9.820980615735461, + "grad_norm": 3.09375, + "learning_rate": 4.53927143759747e-09, + "loss": 0.5772, + "mean_token_accuracy": 0.8801241964101791, + "num_tokens": 461029101.0, + "step": 4311 + }, + { + "epoch": 9.823261117445838, + "grad_norm": 2.484375, + "learning_rate": 4.426532204175049e-09, + "loss": 0.5596, + "mean_token_accuracy": 0.8861949443817139, + "num_tokens": 461136681.0, + "step": 4312 + }, + { + "epoch": 9.825541619156214, + "grad_norm": 2.703125, + "learning_rate": 4.3152094100740175e-09, + "loss": 0.5629, + "mean_token_accuracy": 0.8844578862190247, + "num_tokens": 461244074.0, + "step": 4313 + }, + { + "epoch": 9.82782212086659, + "grad_norm": 2.890625, + "learning_rate": 4.205303118479109e-09, + "loss": 0.5693, + "mean_token_accuracy": 0.8827894330024719, + "num_tokens": 461351000.0, + "step": 4314 + }, + { + "epoch": 9.830102622576966, + "grad_norm": 2.71875, + "learning_rate": 4.096813391770982e-09, + "loss": 0.5606, + "mean_token_accuracy": 0.8869377076625824, + "num_tokens": 461458568.0, + "step": 4315 + }, + { + "epoch": 9.832383124287343, + "grad_norm": 3.234375, + "learning_rate": 3.989740291526212e-09, + "loss": 0.5671, + "mean_token_accuracy": 0.8829131424427032, + "num_tokens": 461565463.0, + "step": 4316 + }, + { + "epoch": 9.83466362599772, + "grad_norm": 3.046875, + "learning_rate": 3.884083878517575e-09, + "loss": 0.5668, + "mean_token_accuracy": 0.8803281784057617, + "num_tokens": 461672563.0, + "step": 4317 + }, + { + "epoch": 9.836944127708096, + "grad_norm": 3.71875, + "learning_rate": 3.779844212713213e-09, + "loss": 0.5542, + "mean_token_accuracy": 0.8861279636621475, + "num_tokens": 461779451.0, + "step": 4318 + }, + { + "epoch": 9.839224629418473, + "grad_norm": 2.734375, + "learning_rate": 3.6770213532782985e-09, + "loss": 0.5639, + "mean_token_accuracy": 0.8874040246009827, + "num_tokens": 461886457.0, + "step": 4319 + }, + { + "epoch": 9.841505131128848, + "grad_norm": 3.5, + "learning_rate": 3.5756153585725374e-09, + "loss": 0.5524, + "mean_token_accuracy": 0.8833809942007065, + "num_tokens": 461993587.0, + "step": 4320 + }, + { + "epoch": 9.843785632839225, + "grad_norm": 3.03125, + "learning_rate": 3.475626286152112e-09, + "loss": 0.5725, + "mean_token_accuracy": 0.8821329474449158, + "num_tokens": 462100785.0, + "step": 4321 + }, + { + "epoch": 9.8460661345496, + "grad_norm": 2.65625, + "learning_rate": 3.3770541927691247e-09, + "loss": 0.5641, + "mean_token_accuracy": 0.8800762295722961, + "num_tokens": 462207714.0, + "step": 4322 + }, + { + "epoch": 9.848346636259977, + "grad_norm": 2.671875, + "learning_rate": 3.2798991343707676e-09, + "loss": 0.5559, + "mean_token_accuracy": 0.8831606060266495, + "num_tokens": 462314885.0, + "step": 4323 + }, + { + "epoch": 9.850627137970353, + "grad_norm": 3.921875, + "learning_rate": 3.1841611661007077e-09, + "loss": 0.5608, + "mean_token_accuracy": 0.8826912045478821, + "num_tokens": 462422224.0, + "step": 4324 + }, + { + "epoch": 9.85290763968073, + "grad_norm": 3.1875, + "learning_rate": 3.089840342297701e-09, + "loss": 0.5771, + "mean_token_accuracy": 0.8812664598226547, + "num_tokens": 462529435.0, + "step": 4325 + }, + { + "epoch": 9.855188141391107, + "grad_norm": 3.75, + "learning_rate": 2.9969367164969787e-09, + "loss": 0.5616, + "mean_token_accuracy": 0.8857505470514297, + "num_tokens": 462636769.0, + "step": 4326 + }, + { + "epoch": 9.857468643101482, + "grad_norm": 3.484375, + "learning_rate": 2.905450341428029e-09, + "loss": 0.5718, + "mean_token_accuracy": 0.8818775713443756, + "num_tokens": 462743978.0, + "step": 4327 + }, + { + "epoch": 9.85974914481186, + "grad_norm": 3.421875, + "learning_rate": 2.8153812690173697e-09, + "loss": 0.5767, + "mean_token_accuracy": 0.8814905136823654, + "num_tokens": 462850538.0, + "step": 4328 + }, + { + "epoch": 9.862029646522235, + "grad_norm": 2.625, + "learning_rate": 2.726729550386331e-09, + "loss": 0.5719, + "mean_token_accuracy": 0.8831851780414581, + "num_tokens": 462957437.0, + "step": 4329 + }, + { + "epoch": 9.864310148232612, + "grad_norm": 3.15625, + "learning_rate": 2.6394952358518854e-09, + "loss": 0.5804, + "mean_token_accuracy": 0.8794573098421097, + "num_tokens": 463063901.0, + "step": 4330 + }, + { + "epoch": 9.866590649942987, + "grad_norm": 2.875, + "learning_rate": 2.553678374926649e-09, + "loss": 0.5617, + "mean_token_accuracy": 0.8824156671762466, + "num_tokens": 463171455.0, + "step": 4331 + }, + { + "epoch": 9.868871151653364, + "grad_norm": 3.3125, + "learning_rate": 2.4692790163183268e-09, + "loss": 0.5688, + "mean_token_accuracy": 0.8811941742897034, + "num_tokens": 463278442.0, + "step": 4332 + }, + { + "epoch": 9.87115165336374, + "grad_norm": 3.796875, + "learning_rate": 2.3862972079305435e-09, + "loss": 0.5559, + "mean_token_accuracy": 0.8832034319639206, + "num_tokens": 463385401.0, + "step": 4333 + }, + { + "epoch": 9.873432155074116, + "grad_norm": 3.90625, + "learning_rate": 2.3047329968620137e-09, + "loss": 0.5627, + "mean_token_accuracy": 0.8862831592559814, + "num_tokens": 463492423.0, + "step": 4334 + }, + { + "epoch": 9.875712656784494, + "grad_norm": 3.0, + "learning_rate": 2.2245864294073715e-09, + "loss": 0.5696, + "mean_token_accuracy": 0.8842545598745346, + "num_tokens": 463599395.0, + "step": 4335 + }, + { + "epoch": 9.877993158494869, + "grad_norm": 2.875, + "learning_rate": 2.145857551056063e-09, + "loss": 0.5805, + "mean_token_accuracy": 0.8817007839679718, + "num_tokens": 463707615.0, + "step": 4336 + }, + { + "epoch": 9.880273660205246, + "grad_norm": 2.796875, + "learning_rate": 2.0685464064928996e-09, + "loss": 0.5538, + "mean_token_accuracy": 0.8855222314596176, + "num_tokens": 463814300.0, + "step": 4337 + }, + { + "epoch": 9.882554161915621, + "grad_norm": 3.03125, + "learning_rate": 1.992653039598613e-09, + "loss": 0.5621, + "mean_token_accuracy": 0.8810195475816727, + "num_tokens": 463921566.0, + "step": 4338 + }, + { + "epoch": 9.884834663625998, + "grad_norm": 3.234375, + "learning_rate": 1.91817749344847e-09, + "loss": 0.5934, + "mean_token_accuracy": 0.8760579079389572, + "num_tokens": 464028254.0, + "step": 4339 + }, + { + "epoch": 9.887115165336374, + "grad_norm": 2.5, + "learning_rate": 1.8451198103133783e-09, + "loss": 0.5684, + "mean_token_accuracy": 0.8798963725566864, + "num_tokens": 464135330.0, + "step": 4340 + }, + { + "epoch": 9.88939566704675, + "grad_norm": 4.78125, + "learning_rate": 1.7734800316596135e-09, + "loss": 0.577, + "mean_token_accuracy": 0.8792663961648941, + "num_tokens": 464242517.0, + "step": 4341 + }, + { + "epoch": 9.891676168757126, + "grad_norm": 2.953125, + "learning_rate": 1.703258198148261e-09, + "loss": 0.5634, + "mean_token_accuracy": 0.884123831987381, + "num_tokens": 464349209.0, + "step": 4342 + }, + { + "epoch": 9.893956670467503, + "grad_norm": 3.703125, + "learning_rate": 1.6344543496360499e-09, + "loss": 0.5629, + "mean_token_accuracy": 0.8826566338539124, + "num_tokens": 464456526.0, + "step": 4343 + }, + { + "epoch": 9.896237172177878, + "grad_norm": 2.765625, + "learning_rate": 1.567068525175075e-09, + "loss": 0.5671, + "mean_token_accuracy": 0.887358620762825, + "num_tokens": 464563578.0, + "step": 4344 + }, + { + "epoch": 9.898517673888255, + "grad_norm": 3.625, + "learning_rate": 1.5011007630114093e-09, + "loss": 0.5738, + "mean_token_accuracy": 0.8806861042976379, + "num_tokens": 464670481.0, + "step": 4345 + }, + { + "epoch": 9.900798175598633, + "grad_norm": 2.8125, + "learning_rate": 1.4365511005878796e-09, + "loss": 0.5689, + "mean_token_accuracy": 0.8842573165893555, + "num_tokens": 464778377.0, + "step": 4346 + }, + { + "epoch": 9.903078677309008, + "grad_norm": 3.796875, + "learning_rate": 1.3734195745412904e-09, + "loss": 0.5739, + "mean_token_accuracy": 0.8824024051427841, + "num_tokens": 464885701.0, + "step": 4347 + }, + { + "epoch": 9.905359179019385, + "grad_norm": 3.625, + "learning_rate": 1.3117062207038123e-09, + "loss": 0.577, + "mean_token_accuracy": 0.880613699555397, + "num_tokens": 464992795.0, + "step": 4348 + }, + { + "epoch": 9.90763968072976, + "grad_norm": 3.53125, + "learning_rate": 1.2514110741029816e-09, + "loss": 0.5741, + "mean_token_accuracy": 0.8834188729524612, + "num_tokens": 465099950.0, + "step": 4349 + }, + { + "epoch": 9.909920182440137, + "grad_norm": 2.734375, + "learning_rate": 1.1925341689608682e-09, + "loss": 0.5755, + "mean_token_accuracy": 0.8817101418972015, + "num_tokens": 465207553.0, + "step": 4350 + }, + { + "epoch": 9.912200684150513, + "grad_norm": 3.59375, + "learning_rate": 1.135075538695185e-09, + "loss": 0.5977, + "mean_token_accuracy": 0.8769432902336121, + "num_tokens": 465314057.0, + "step": 4351 + }, + { + "epoch": 9.91448118586089, + "grad_norm": 3.875, + "learning_rate": 1.0790352159179007e-09, + "loss": 0.5812, + "mean_token_accuracy": 0.8812268972396851, + "num_tokens": 465420891.0, + "step": 4352 + }, + { + "epoch": 9.916761687571265, + "grad_norm": 3.6875, + "learning_rate": 1.024413232436905e-09, + "loss": 0.5762, + "mean_token_accuracy": 0.878851979970932, + "num_tokens": 465527864.0, + "step": 4353 + }, + { + "epoch": 9.919042189281642, + "grad_norm": 3.09375, + "learning_rate": 9.71209619254343e-10, + "loss": 0.5747, + "mean_token_accuracy": 0.8795945793390274, + "num_tokens": 465634366.0, + "step": 4354 + }, + { + "epoch": 9.921322690992017, + "grad_norm": 3.390625, + "learning_rate": 9.194244065674484e-10, + "loss": 0.5829, + "mean_token_accuracy": 0.8763747960329056, + "num_tokens": 465741337.0, + "step": 4355 + }, + { + "epoch": 9.923603192702394, + "grad_norm": 3.0, + "learning_rate": 8.690576237688208e-10, + "loss": 0.573, + "mean_token_accuracy": 0.8846324980258942, + "num_tokens": 465848967.0, + "step": 4356 + }, + { + "epoch": 9.925883694412772, + "grad_norm": 2.609375, + "learning_rate": 8.201092994453153e-10, + "loss": 0.5799, + "mean_token_accuracy": 0.879865899682045, + "num_tokens": 465955628.0, + "step": 4357 + }, + { + "epoch": 9.928164196123147, + "grad_norm": 2.953125, + "learning_rate": 7.725794613791527e-10, + "loss": 0.5724, + "mean_token_accuracy": 0.8813536763191223, + "num_tokens": 466062877.0, + "step": 4358 + }, + { + "epoch": 9.930444697833524, + "grad_norm": 3.75, + "learning_rate": 7.264681365476422e-10, + "loss": 0.5683, + "mean_token_accuracy": 0.8843436688184738, + "num_tokens": 466170580.0, + "step": 4359 + }, + { + "epoch": 9.9327251995439, + "grad_norm": 3.125, + "learning_rate": 6.817753511226266e-10, + "loss": 0.5662, + "mean_token_accuracy": 0.8817441612482071, + "num_tokens": 466277744.0, + "step": 4360 + }, + { + "epoch": 9.935005701254276, + "grad_norm": 2.375, + "learning_rate": 6.385011304704814e-10, + "loss": 0.5443, + "mean_token_accuracy": 0.8887474536895752, + "num_tokens": 466384776.0, + "step": 4361 + }, + { + "epoch": 9.937286202964652, + "grad_norm": 2.875, + "learning_rate": 5.96645499152948e-10, + "loss": 0.5935, + "mean_token_accuracy": 0.8781139552593231, + "num_tokens": 466491782.0, + "step": 4362 + }, + { + "epoch": 9.939566704675029, + "grad_norm": 2.78125, + "learning_rate": 5.562084809268564e-10, + "loss": 0.5561, + "mean_token_accuracy": 0.8866285234689713, + "num_tokens": 466598743.0, + "step": 4363 + }, + { + "epoch": 9.941847206385404, + "grad_norm": 2.6875, + "learning_rate": 5.171900987430146e-10, + "loss": 0.5625, + "mean_token_accuracy": 0.8828070014715195, + "num_tokens": 466705736.0, + "step": 4364 + }, + { + "epoch": 9.944127708095781, + "grad_norm": 2.78125, + "learning_rate": 4.795903747475961e-10, + "loss": 0.5786, + "mean_token_accuracy": 0.8803279101848602, + "num_tokens": 466812755.0, + "step": 4365 + }, + { + "epoch": 9.946408209806158, + "grad_norm": 2.734375, + "learning_rate": 4.434093302815856e-10, + "loss": 0.5639, + "mean_token_accuracy": 0.8847870975732803, + "num_tokens": 466919779.0, + "step": 4366 + }, + { + "epoch": 9.948688711516533, + "grad_norm": 2.75, + "learning_rate": 4.0864698588077844e-10, + "loss": 0.5774, + "mean_token_accuracy": 0.883985847234726, + "num_tokens": 467026369.0, + "step": 4367 + }, + { + "epoch": 9.95096921322691, + "grad_norm": 3.09375, + "learning_rate": 3.7530336127550306e-10, + "loss": 0.6001, + "mean_token_accuracy": 0.8794097006320953, + "num_tokens": 467133105.0, + "step": 4368 + }, + { + "epoch": 9.953249714937286, + "grad_norm": 3.21875, + "learning_rate": 3.4337847539089866e-10, + "loss": 0.5515, + "mean_token_accuracy": 0.8836051672697067, + "num_tokens": 467240349.0, + "step": 4369 + }, + { + "epoch": 9.955530216647663, + "grad_norm": 3.125, + "learning_rate": 3.1287234634663766e-10, + "loss": 0.5822, + "mean_token_accuracy": 0.8793405294418335, + "num_tokens": 467347282.0, + "step": 4370 + }, + { + "epoch": 9.957810718358038, + "grad_norm": 2.71875, + "learning_rate": 2.8378499145803593e-10, + "loss": 0.5576, + "mean_token_accuracy": 0.8886418640613556, + "num_tokens": 467454418.0, + "step": 4371 + }, + { + "epoch": 9.960091220068415, + "grad_norm": 4.1875, + "learning_rate": 2.5611642723410987e-10, + "loss": 0.5857, + "mean_token_accuracy": 0.8817699551582336, + "num_tokens": 467561394.0, + "step": 4372 + }, + { + "epoch": 9.96237172177879, + "grad_norm": 3.890625, + "learning_rate": 2.2986666937896418e-10, + "loss": 0.5689, + "mean_token_accuracy": 0.8804190456867218, + "num_tokens": 467668306.0, + "step": 4373 + }, + { + "epoch": 9.964652223489168, + "grad_norm": 2.875, + "learning_rate": 2.050357327917918e-10, + "loss": 0.5541, + "mean_token_accuracy": 0.8814870566129684, + "num_tokens": 467775667.0, + "step": 4374 + }, + { + "epoch": 9.966932725199545, + "grad_norm": 2.515625, + "learning_rate": 1.816236315657638e-10, + "loss": 0.5719, + "mean_token_accuracy": 0.8799229115247726, + "num_tokens": 467883505.0, + "step": 4375 + }, + { + "epoch": 9.96921322690992, + "grad_norm": 2.875, + "learning_rate": 1.5963037898913957e-10, + "loss": 0.5626, + "mean_token_accuracy": 0.8831915408372879, + "num_tokens": 467990987.0, + "step": 4376 + }, + { + "epoch": 9.971493728620297, + "grad_norm": 2.796875, + "learning_rate": 1.3905598754526684e-10, + "loss": 0.5844, + "mean_token_accuracy": 0.8801503032445908, + "num_tokens": 468098922.0, + "step": 4377 + }, + { + "epoch": 9.973774230330672, + "grad_norm": 2.859375, + "learning_rate": 1.1990046891147133e-10, + "loss": 0.5554, + "mean_token_accuracy": 0.8852853924036026, + "num_tokens": 468206794.0, + "step": 4378 + }, + { + "epoch": 9.97605473204105, + "grad_norm": 2.625, + "learning_rate": 1.021638339598896e-10, + "loss": 0.5712, + "mean_token_accuracy": 0.8826959729194641, + "num_tokens": 468313770.0, + "step": 4379 + }, + { + "epoch": 9.978335233751425, + "grad_norm": 3.421875, + "learning_rate": 8.584609275802402e-11, + "loss": 0.5628, + "mean_token_accuracy": 0.8853756189346313, + "num_tokens": 468420750.0, + "step": 4380 + }, + { + "epoch": 9.980615735461802, + "grad_norm": 2.921875, + "learning_rate": 7.094725456707752e-11, + "loss": 0.5398, + "mean_token_accuracy": 0.8900392800569534, + "num_tokens": 468527796.0, + "step": 4381 + }, + { + "epoch": 9.982896237172177, + "grad_norm": 3.21875, + "learning_rate": 5.746732784361886e-11, + "loss": 0.5661, + "mean_token_accuracy": 0.8818590939044952, + "num_tokens": 468634501.0, + "step": 4382 + }, + { + "epoch": 9.985176738882554, + "grad_norm": 2.921875, + "learning_rate": 4.540632023819491e-11, + "loss": 0.5673, + "mean_token_accuracy": 0.8829274624586105, + "num_tokens": 468741744.0, + "step": 4383 + }, + { + "epoch": 9.987457240592931, + "grad_norm": 3.09375, + "learning_rate": 3.47642385967184e-11, + "loss": 0.5709, + "mean_token_accuracy": 0.8815714865922928, + "num_tokens": 468848526.0, + "step": 4384 + }, + { + "epoch": 9.989737742303307, + "grad_norm": 2.84375, + "learning_rate": 2.5541088959357697e-11, + "loss": 0.5437, + "mean_token_accuracy": 0.8854826241731644, + "num_tokens": 468955714.0, + "step": 4385 + }, + { + "epoch": 9.992018244013684, + "grad_norm": 2.90625, + "learning_rate": 1.773687656109191e-11, + "loss": 0.5839, + "mean_token_accuracy": 0.8776491433382034, + "num_tokens": 469062395.0, + "step": 4386 + }, + { + "epoch": 9.994298745724059, + "grad_norm": 3.671875, + "learning_rate": 1.1351605831433354e-11, + "loss": 0.5663, + "mean_token_accuracy": 0.8828050196170807, + "num_tokens": 469169221.0, + "step": 4387 + }, + { + "epoch": 9.996579247434436, + "grad_norm": 4.84375, + "learning_rate": 6.385280394149984e-12, + "loss": 0.5735, + "mean_token_accuracy": 0.8813241869211197, + "num_tokens": 469276076.0, + "step": 4388 + }, + { + "epoch": 9.998859749144811, + "grad_norm": 4.5625, + "learning_rate": 2.8379030686531696e-12, + "loss": 0.5724, + "mean_token_accuracy": 0.8789683431386948, + "num_tokens": 469383376.0, + "step": 4389 + }, + { + "epoch": 10.0, + "grad_norm": 4.90625, + "learning_rate": 7.094758677772539e-13, + "loss": 0.578, + "mean_token_accuracy": 0.8766284584999084, + "num_tokens": 469422320.0, + "step": 4390 + } + ], + "logging_steps": 1, + "max_steps": 4390, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.252059023052636e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}